Szerkesztő:Tgr/huflex
-- inflection and other language processing functions for Hungarian
local Huflex = {}
local sf, utils = require('Module:StringFunctions'), require('Module:Utils')
-- Hungarian vowels, grouped into the classes that matter for vowel harmony.
local backVowels = Set {'a', 'á', 'o', 'ó', 'u', 'ú'}
local frontIllabialVowels = Set {'e', 'é', 'i', 'í'}
local frontLabialVowels = Set {'ö', 'ő', 'ü', 'ű'}
local frontVowels = frontIllabialVowels + frontLabialVowels
local vowels = {
    back = backVowels,
    frontIllabial = frontIllabialVowels,
    frontLabial = frontLabialVowels,
    front = frontVowels,
    all = frontVowels + backVowels,
    -- maps every short vowel to its long counterpart
    shortToLong = {
        ['a'] = 'á',
        ['e'] = 'é',
        ['i'] = 'í',
        ['o'] = 'ó',
        ['ö'] = 'ő',
        ['u'] = 'ú',
        ['ü'] = 'ű',
    },
}
-- Hungarian consonant digraphs and their doubled (long) spellings.
local digraphs = {
    short = Set {'cs', 'dz', 'gy', 'ly', 'ny', 'sz', 'zs'},
    long = Set {'ccs', 'ddz', 'ggy', 'lly', 'nny', 'ssz', 'zzs'},
    -- Doubles a digraph by repeating its first letter in front of it, e.g. 'sz' -> 'ssz'.
    shortToLong = function(digraph)
        local firstLetter = digraph:sub(1, 1)
        return firstLetter .. digraph
    end,
}
-- Determines the vowel harmony of a word.
-- word: the string to analyse; only the part after the last space or hyphen is considered.
-- Returns two values: the vowel harmony ('front', 'back' or 'mixed') and the last vowel
-- of that final part (nil when it contains no vowel, in which case the harmony is 'back').
function Huflex.vowelHarmony(word)
    if word:sub(-4) == 'wiki' then
        -- the last part of a composite word determines its vowel harmony; there is no way to handle this generally,
        -- but sitenames ending with 'wiki' are frequent enough to handle as a special case
        return 'front', 'i'
    end

    -- all state is local so that nothing leaks between calls
    local hasBack, hasFront, lastVowel = false, false, nil
    for c in sf.split(word) do
        if c == ' ' or c == '-' then
            -- only the last word counts; start again
            hasBack, hasFront, lastVowel = false, false, nil
        elseif vowels.all:has(c) then
            lastVowel = c
            if vowels.back:has(c) then
                hasBack = true
            else
                hasFront = true
            end
        end
    end

    if hasFront and hasBack then
        return 'mixed', lastVowel
    elseif hasFront then
        return 'front', lastVowel
    end
    return 'back', lastVowel
end
-- Selects the matching suffix form based on vowel harmony.
-- vowelHarmony: 'back', 'front' or 'mixed' (as returned by Huflex.vowelHarmony)
-- lastVowel: the last vowel of the word
-- suffix1, suffix2, suffix3: back, front and front-labial (rounded) forms of the suffix;
--   suffix2 and suffix3 may be nil when the suffix has fewer forms.
-- Returns the chosen suffix form; raises an error for an unknown vowelHarmony value.
function Huflex.selectSuffix(vowelHarmony, lastVowel, suffix1, suffix2, suffix3)
    if suffix2 == nil then -- trivial case, suffix has only one form
        return suffix1
    elseif vowelHarmony == 'back' then
        return suffix1
    elseif vowelHarmony == 'front' then
        if suffix3 ~= nil and vowels.frontLabial:has(lastVowel) then
            return suffix3
        end
        return suffix2
    elseif vowelHarmony == 'mixed' then
        -- a mixed word ending in a back or front illabial vowel takes the back form
        if vowels.back:has(lastVowel) or vowels.frontIllabial:has(lastVowel) then
            return suffix1
        end
        -- otherwise the last vowel is front labial
        if suffix3 ~= nil then
            return suffix3
        end
        return suffix2
    end
    error('invalid value for vowelHarmony: ' .. tostring(vowelHarmony))
end
-- Combines a word with a suffix according to Hungarian grammar (vowel harmony + assimilation).
-- Far from perfect, but should work with the suffixes following {{SITENAME}} in the interface
-- messages, unless the sitename is some tricky composite or foreign word.
-- addSuffix does three things:
-- 1) selects the suffix with matching vowel harmony (first parameter should be back, second front,
--    third rounded (labial); second and third might be omitted if the suffix has fewer forms).
-- 2) if the last letter of the word is 'a', 'e' or 'o', changes it to 'á', 'é' or 'ó' respectively.
-- 3) if the first letter of the suffix is 'v', changes it according to assimilation rules. (This can get
--    complicated if the last letter of the word is a digraph/trigraph or a double consonant.)
-- frame arguments: 1 = word, 2..4 = suffix forms.
-- Returns the suffixed word; raises an error when the word or the first suffix form is missing.
-- TODO http://www.szabogabor.net/2011/09/27/val-vel-rag-generalasa-szohoz/
function Huflex.addSuffix(frame)
    local word, suffix1, suffix2, suffix3 = unpackFrame(frame, 1, 2, 3, 4)
    if word == nil or suffix1 == nil then
        error('addSuffix: a word and at least one suffix form are required')
    end
    -- trim each argument on its own: packing them into a table and unpacking it again
    -- would lose values located after a missing (nil) argument
    word, suffix1 = sf.trim(word), sf.trim(suffix1)
    if suffix2 ~= nil then
        suffix2 = sf.trim(suffix2)
    end
    if suffix3 ~= nil then
        suffix3 = sf.trim(suffix3)
    end

    local vowelHarmony, lastVowel = Huflex.vowelHarmony(word)
    local suffix = Huflex.selectSuffix(vowelHarmony, lastVowel, suffix1, suffix2, suffix3)

    -- change end of word: a -> á, e -> é, o -> ó
    local lastChar = sf.sub(word, -1)
    if lastChar == 'a' or lastChar == 'e' or lastChar == 'o' then
        lastChar = vowels.shortToLong[lastChar]
        word = sf.sub(word, 1, -2) .. lastChar
    end

    local wordEndsWithVowel = vowels.all:has(lastChar)
    local suffixStart = sf.sub(suffix, 1, 1)

    if suffixStart == 'v' and not wordEndsWithVowel then
        -- change start of suffix: v assimilates if the word ends with a consonant
        if sf.sub(word, -2, -2) == lastChar or digraphs.long:has(sf.sub(word, -3)) then
            -- long consonant, does not get any longer
            suffix = sf.sub(suffix, 2)
        elseif digraphs.short:has(sf.sub(word, -2)) then
            -- ends with short digraph, which becomes long
            suffix = sf.sub(suffix, 2)
            word = sf.sub(word, 1, -3) .. digraphs.shortToLong(sf.sub(word, -2))
        else
            -- single character, will become double now
            suffix = lastChar .. sf.sub(suffix, 2)
        end
    elseif wordEndsWithVowel and vowels.all:has(suffixStart) then
        -- leave out first character of the suffix if it is a vowel and the word also ends with a vowel
        suffix = sf.sub(suffix, 2)
    end

    return word .. suffix
end
return Huflex
Utils:
-- utility functions (not namespaced!)
-- Applies func to every value of array and returns the results in a new table,
-- stored under the same keys as in the input.
function map(func, array)
    local mapped = {}
    for key, value in pairs(array) do
        local result = func(value)
        mapped[key] = result
    end
    return mapped
end
-- Extracts the requested arguments from a frame object.
-- frame: a table with an `args` field holding the arguments
-- ...: the keys to look up in frame.args
-- Returns the looked-up values as multiple results, in the order the keys were given;
-- a missing argument yields nil in its position without cutting off the ones after it.
function unpackFrame(frame, ...)
    -- unpack moved into the table library in Lua 5.2+
    local unpackValues = table.unpack or unpack
    local keys = {...}
    local count = select('#', ...)
    local values = {}
    for position = 1, count do
        values[position] = frame.args[keys[position]]
    end
    -- explicit bounds keep nil holes from truncating the result list
    return unpackValues(values, 1, count)
end
-- Returns a new table that inverts the original: every value becomes a key
-- and maps to the key it was stored under. When a value occurs more than once,
-- which of its keys survives depends on the iteration order of pairs.
function table.reverse(tbl)
    local inverted = {}
    for originalKey, originalValue in pairs(tbl) do
        inverted[originalValue] = originalKey
    end
    return inverted
end
-- Returns a new table containing the entries of both tables.
-- For keys present in both, the value from the second table wins.
-- Neither input table is modified.
function table.merge(tbl1, tbl2)
    local function copyInto(target, source)
        for key, value in pairs(source) do
            target[key] = value
        end
        return target
    end
    return copyInto(copyInto({}, tbl1), tbl2)
end
-- Renders a table as a human-readable string for debugging.
-- tbl: the table to dump; any other type yields '<not a table!>'
-- recursive: when truthy, nested tables are dumped as well, otherwise they are shown as 'table'
--   (there is no cycle detection, so do not dump self-referencing tables recursively)
-- Returns a string of the form '{ [key] = value,[key] = value,} '; non-numeric keys are quoted.
-- Entry order follows pairs and is therefore unspecified.
function table.dump(tbl, recursive)
    if type(tbl) ~= 'table' then
        return '<not a table!>'
    end
    local parts = {'{ '}
    for k, v in pairs(tbl) do
        -- tostring copes with every key/value type (booleans, functions, ...),
        -- which plain concatenation does not
        local keyText = tostring(k)
        if type(k) ~= 'number' then
            keyText = '"' .. keyText .. '"'
        end
        local valueText
        if type(v) ~= 'table' then
            valueText = tostring(v)
        elseif recursive then
            valueText = table.dump(v, true)
        else
            valueText = 'table'
        end
        parts[#parts + 1] = '[' .. keyText .. '] = ' .. valueText .. ','
    end
    parts[#parts + 1] = '} '
    return table.concat(parts)
end
-- Metatable shared by all sets: provides the `has` method and set union through `+`.
SetMeta = {
    __index = {
        -- Returns true when key is stored in the set.
        -- rawget is required here: a plain tbl[key] would fall through to this
        -- __index table and report the method name 'has' as a member of every set.
        -- (A table that really stores the key 'has' shadows this method.)
        has = function(tbl, key)
            return rawget(tbl, key) ~= nil
        end
    },
    -- Union: returns a new set holding the keys of both operands.
    __add = function(set1, set2)
        local set = table.merge(set1, set2)
        setmetatable(set, SetMeta)
        return set
    end,
}
-- Builds a set from an array: every array element becomes a key mapped to true.
-- The result carries SetMeta as its metatable.
function Set(list)
    local members = {}
    for _, element in ipairs(list) do
        members[element] = true
    end
    return setmetatable(members, SetMeta)
end
-- Functions exported by this module for manual testing.
local Test = {}

-- Builds three sets from the first six frame arguments ({1,2,3}, {4,5,6} and {1,2,5,6})
-- and returns the dumps of first+second and first+third, separated by '|'.
function Test.testMerge(frame)
    local a, b, c, d, e, f = unpackFrame(frame, 1, 2, 3, 4, 5, 6)
    local first, second, third = Set{a, b, c}, Set{d, e, f}, Set{a, b, e, f}
    local unionWithSecond = table.dump(first + second)
    local unionWithThird = table.dump(first + third)
    return unionWithSecond .. '|' .. unionWithThird
end
return Test
StringFunctions:
-- basic string manipulation functions
local StringFunctions = {}

-- Strips leading and trailing whitespace.
-- str: a string, nil, or a table with an `args` field, in which case args[1] is trimmed
-- Returns the trimmed string, or nil when there is nothing to trim.
function StringFunctions.trim(str)
    -- check the table case explicitly so that a nil argument reaches the nil check
    -- instead of failing on the field access
    if type(str) == 'table' and str.args then
        str = str.args[1]
    end
    if str == nil then
        return nil
    end
    return (str:gsub("^%s*(.-)%s*$", "%1")) -- extra brackets are necessary because gsub returns multiple values
end

-- Splits a string into (Unicode) characters.
-- Returns an iterator yielding one UTF-8 character per call.
-- Behavior is undefined for input which is invalid UTF-8; when an invalid lead byte is detected,
-- the iterator yields one marker of the form '<error(position:byte)>' and then stops.
function StringFunctions.split(str)
    local i, failed = 1, false
    return function()
        if failed then
            return nil
        end
        local byte = str:byte(i)
        if byte == nil then -- end of string
            return nil
        end
        -- determine number of 1 bits before the first 0 in byte
        local leadBits, bitValue, remainder = 0, 128, byte
        while bitValue <= remainder and bitValue > 1 do
            leadBits = leadBits + 1
            remainder = remainder - bitValue
            bitValue = bitValue / 2
        end
        local length -- length of the next UTF-8 character in bytes
        if leadBits == 0 then -- ASCII character
            length = 1
        elseif leadBits == 1 or leadBits > 6 then -- not valid UTF-8
            failed = true
            return '<error(' .. i .. ':' .. byte .. ')>'
        else
            length = leadBits
        end
        local chr = str:sub(i, i + length - 1)
        i = i + length
        return chr
    end
end

-- UTF-8 aware version of string:len; counts characters instead of bytes.
function StringFunctions.len(str)
    local count = 0
    for _ in StringFunctions.split(str) do
        count = count + 1
    end
    return count
end

-- UTF-8 aware version of string:sub.
-- str: the string to cut
-- i: position of the first character to keep; negative values count from the end
-- j: (optional) position of the last character to keep; negative values count from the end
-- Returns the selected characters as a string.
function StringFunctions.sub(str, i, j)
    -- decode once; the character count is needed to resolve negative positions
    local chars = {}
    for c in StringFunctions.split(str) do
        chars[#chars + 1] = c
    end
    local count = #chars
    if i < 0 then
        i = count + i + 1
    end
    if j and j < 0 then
        j = count + j + 1
    end
    local picked = {}
    for pos = 1, count do
        if pos >= i and (not j or pos <= j) then
            picked[#picked + 1] = chars[pos]
        end
    end
    return table.concat(picked)
end
return StringFunctions