-- Modul:Ancient Greek
-- Table holding the functions this module exports.
local p = {}

-- Combining diacritics, built from their code points.
local macron, breve = mw.ustring.char(0x304), mw.ustring.char(0x306)
local rough, smooth = mw.ustring.char(0x314), mw.ustring.char(0x313)
local diaeresis = mw.ustring.char(0x308)
local acute, grave = mw.ustring.char(0x301), mw.ustring.char(0x300)
local circumflex, Latin_circumflex = mw.ustring.char(0x342), mw.ustring.char(0x302)
local subscript = mw.ustring.char(0x345)

-- Pattern fragment: a macron, an optional diaeresis, then a Latin circumflex.
local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

-- Letters treated as velars when deciding how to transliterate a preceding γ.
local is_velar = {
	["κ"] = true,
	["γ"] = true,
	["χ"] = true,
	["ξ"] = true,
}

-- Byte-level pattern matching one UTF-8 encoded character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
-- Byte-level pattern for a two-byte character whose lead byte is 206 or 207.
-- This excludes the first line of the Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ
local basic_Greek = "[\206-\207][\128-\191]"

-- Maps a character to the table describing its class.
local info = {}

-- Class tables are shared between characters, which saves space and lets a
-- character's class be tested by comparing table identity.
local vowel = {
	vowel = true,
	diacritic_seat = true,
}
local iota = {
	vowel = true,
	diacritic_seat = true,
	offglide = true,
}
local upsilon = {
	vowel = true,
	diacritic_seat = true,
	offglide = true,
}
-- Strictly speaking, rho can only carry a rough or smooth breathing.
local rho = {
	consonant = true,
	diacritic_seat = true,
}
local consonant = { consonant = true }
local diacritic = { diacritic = true }
-- Kept as a separate table so breathings can be told apart by identity.
local breathing = { diacritic = true }
-- Records the class table `t` in `info` for each given character.
-- `characters` is either a string, which is split into UTF-8 characters, or an
-- array whose elements are each used as a key as-is.
local function add_info(characters, t)
	local function register(character)
		info[character] = t
	end

	if type(characters) ~= "string" then
		for _, entry in ipairs(characters) do
			register(entry)
		end
		return
	end

	for glyph in characters:gmatch(UTF8_char) do
		register(glyph)
	end
end
-- Register every classified character. The entries are applied in order, so
-- the rho entry at the end replaces the generic consonant class that the
-- preceding entry assigned to Ρ and ρ.
do
	local registrations = {
		{ { macron, breve, diaeresis, acute, grave, circumflex, subscript }, diacritic },
		{ { rough, smooth }, breathing },
		{ "ΑΕΗΟΩαεηοω", vowel },
		{ "Ιι", iota },
		{ "Υυ", upsilon },
		{ "ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant },
		{ "Ρρ", rho },
	}
	for _, registration in ipairs(registrations) do
		add_info(registration[1], registration[2])
	end
end

-- Any character without an entry resolves to this shared empty table, so
-- lookups such as info[c].vowel never index nil.
local not_recognized = {}
setmetatable(info, {
	__index = function()
		return not_recognized
	end,
})
-- Wraps `str` in curly double quotation marks.
local function quote(str)
	local opening, closing = "“", "”"
	return opening .. str .. closing
end
-- Mappings common to both transliteration systems. Lookups that miss here are
-- answered by whichever system table is installed as this table's __index.
local correspondences = {
	-- Vowels
	["α"] = "a", ["ε"] = "e", ["η"] = "e" .. macron, ["ι"] = "i",
	["ο"] = "o", ["υ"] = "u", ["ω"] = "o" .. macron,
	-- Consonants
	["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z", ["θ"] = "th",
	["κ"] = "k", ["λ"] = "l", ["μ"] = "m", ["ν"] = "n", ["ξ"] = "x",
	["π"] = "p", ["ρ"] = "r", ["σ"] = "s", ["ς"] = "s", ["τ"] = "t",
	["φ"] = "ph", ["ψ"] = "ps",
	-- Archaic letters
	["ϝ"] = "w", ["ϻ"] = "ś", ["ϙ"] = "q", ["ϡ"] = "š", ["ͷ"] = "v",
	-- Diacritics. The h for a rough breathing is added in `transliterate`.
	[smooth] = "", [rough] = "", [breve] = "",
}

-- ALA-LC overrides: χ becomes "ch" and the listed diacritics are dropped.
local ALA_LC = { ["χ"] = "ch" }
for _, mark in ipairs({ acute, grave, circumflex, subscript, diaeresis, macron }) do
	ALA_LC[mark] = ""
end

-- Wiktionary overrides: χ becomes "kh", the Greek circumflex becomes the Latin
-- one, and the iota subscript is written as "i".
local Wiktionary_transliteration = {
	["χ"] = "kh",
	[circumflex] = Latin_circumflex,
	[subscript] = "i",
}
-- Sets `index_metamethod` as the __index of `t`, reusing the metatable `t`
-- already has or attaching a new one when it has none.
local function add_index_metamethod(t, index_metamethod)
	local meta = getmetatable(t)
	if meta then
		meta.__index = index_metamethod
		return
	end
	setmetatable(t, { __index = index_metamethod })
end
--[=[
	Breaks text into meaningful "tokens": individual letters or diphthongs
	together with the diacritics attached to them. The text is NFD-normalized
	before it is scanned, and every character that has no class in `info`
	becomes a token of its own.

	The original note on this function says it is used by [[Module:grc-accent]]
	and [[Module:grc-pronunciation]]; in this file it is a local function.

	text: the string to split.

	Returns an array of token strings.
	Raises an error when a diacritic follows a character that cannot carry it,
	or when the text begins with a diacritic.
--]=]
local function tokenize(text)
	local tokens, prev_info = {}, {}
	local token_i = 1
	local prev
	for character in string.gmatch(mw.ustring.toNFD(text), UTF8_char) do
		local curr_info = info[character]
		if curr_info.vowel then
			-- Split vowels between tokens if not a diphthong: the vowel stays
			-- in the current token only when it is an offglide (ι, υ) that
			-- directly follows a vowel.
			if prev and (not (curr_info.offglide and prev_info.vowel)
					-- υυ → υ, υ
					-- ιυ → ι, υ
					or prev_info.offglide and curr_info == upsilon) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		elseif curr_info.diacritic then
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info.vowel or prev_info.diacritic then
				if character == diaeresis then
					-- Current token is vowel, vowel, possibly other diacritics,
					-- and a diaeresis.
					-- Split the current token into two:
					-- the first letter, then the second letter plus any diacritics.
					local previous_vowel, vowel_with_diaeresis = string.match(
						tokens[token_i],
						"^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)"
					)
					-- No match means the token holds a single letter, so
					-- there is nothing to split.
					if previous_vowel then
						tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
						token_i = token_i + 1
					end
				end
			elseif prev_info == rho then
				-- Rho accepts breathings only. Raise rather than return the
				-- message, so that callers never receive a string where they
				-- expect the token array.
				if curr_info ~= breathing then
					error(string.format("The character %s cannot have the accent %s on it.", prev, "◌" .. character))
				end
			elseif prev == nil then
				-- The text starts with a diacritic, so there is no character
				-- to name in the message.
				error("The diacritic " .. quote("◌" .. character) .. " has no preceding character to attach to.")
			else
				error("The character " .. quote(prev) .. " cannot have a diacritic on it.")
			end
		elseif curr_info == rho then
			-- Keep this rho in the current token only when it directly follows
			-- a breathing and that token itself begins with a rho; otherwise
			-- it starts a new token.
			if prev and not (prev_info == breathing and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		else
			-- Any other character always begins a new token.
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end
--[=[
	Transliterates Ancient Greek text into the Latin alphabet.

	text:   the Greek text (string).
	system: "ALA-LC" selects the ALA_LC overrides; any other value, including
	        nil, selects the Wiktionary overrides.

	Returns the transliteration as a string.
--]=]
function p.transliterate(text, system)
	-- Everything other than ALA-LC uses the Wiktionary overrides, so the
	-- Wiktionary-only macron rule further down is keyed on this same test.
	-- Otherwise a call without a system would get Wiktionary letters but no
	-- macron on ᾳ.
	local is_ALA_LC = system == "ALA-LC"
	-- Install the selected overrides as the fallback of the shared table.
	add_index_metamethod(correspondences, is_ALA_LC and ALA_LC or Wiktionary_transliteration)
	-- Special case: input consisting solely of the character ῾ yields "h".
	if text == '῾' then
		return 'h'
	end
	text = mw.ustring.toNFD(text)
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	--]]
	text = mw.ustring.gsub(text, "([^A-Za-z0-9])[;" .. mw.ustring.char(0x37E) .. "]", "%1?")
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = text:gsub("·", ";")
	local tokens = tokenize(text)
	-- Now read the tokens. ipairs guarantees they are visited in index order,
	-- which the output depends on; pairs makes no such promise.
	local output = {}
	for i, token in ipairs(tokens) do
		-- Substitute each character in the token for its transliteration.
		local translit = string.gsub(mw.ustring.lower(token), UTF8_char, correspondences)
		if token == 'γ' and is_velar[tokens[i + 1]] then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif not is_ALA_LC and mw.ustring.find(token, '^[αΑ].*' .. subscript .. '$') then
			-- add macron to ᾳ
			translit = mw.ustring.gsub(translit, '([aA])', '%1' .. macron)
		end
		-- A rough breathing becomes h: after the letter on rho, before it
		-- otherwise.
		if token:find(rough) then
			if mw.ustring.find(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		-- In ALA-LC, a token that starts with υ and contains no ι is written
		-- with y instead of u.
		if is_ALA_LC and mw.ustring.find(token, '^[υΥ][^ιΙ]*$') then
			translit = translit:gsub('u', 'y'):gsub('U', 'Y')
		end
		-- Remove macron from a vowel that has a circumflex.
		if mw.ustring.find(translit, macron_circumflex) then
			translit = translit:gsub(macron, '')
		end
		-- Capitalize first character of transliteration when the token
		-- contained any uppercase letter.
		if token ~= mw.ustring.lower(token) then
			translit = mw.ustring.gsub(translit, "^.", mw.ustring.upper)
		end
		output[#output + 1] = translit
	end
	return table.concat(output)
end
--[=[
	Entry point taking a frame object. Reads the text from the first positional
	argument of the frame, or of its parent frame when that is absent, and the
	system from |system=. A missing or empty |system= means "Wiktionary"; any
	value other than "ALA-LC" or "Wiktionary" raises an error.

	Returns the transliteration wrapped in an italic <span> with a title and a
	lang attribute.
--]=]
function p.translit(frame)
	local args = frame.args
	local text = args[1] or frame:getParent().args[1]
	local system = args.system
	if system == nil or system == "" then
		system = "Wiktionary"
	elseif system ~= "ALA-LC" and system ~= "Wiktionary" then
		error('Transliteration system in |system= not recognized; choose between "ALA-LC" and "Wiktionary"')
	end
	local pieces = {
		'<span title="Ancient Greek transliteration" lang="grc-Latn"><i>',
		p.transliterate(text, system),
		'</i></span>',
	}
	return table.concat(pieces)
end
-- Entry point taking a frame object. Returns the plain transliteration of the
-- first positional argument of the frame, or of its parent frame when that is
-- absent, with no HTML wrapper. No system is passed to p.transliterate.
function p.bare_translit(frame)
	local text = frame.args[1]
	if not text then
		text = frame:getParent().args[1]
	end
	return p.transliterate(text)
end
-- Hand the table of exported functions back to whoever loads this module.
return p