Module:Hrkt-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate text in the Kana script. It is used to transliterate Southern Amami Ōshima, Japanese, Hachijō, Kikai, Miyako, Okinoerabu, Northern Amami Ōshima, Yaeyama, Okinawan, Tokunoshima, Kunigami, Yonaguni, and Yoron. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:Hrkt-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

local concat = table.concat
local insert = table.insert
local load_data = mw.loadData
local toNFD = mw.ustring.toNFD
local umatch = mw.ustring.match

local m_ja = require("Module:ja")
local kata_to_hira = m_ja.kata_to_hira
local normalize_kana = m_ja.normalize_kana

-- Lazily-loaded cache of the shared data module "Module:Hrkt-translit/data";
-- stays nil until process_data or get_data first needs it.
local data_common
-- Placeholder characters used inside intermediate results. export.tr replaces
-- both of them with "'" as its very last step.
local glottal = "\1"
local disambig = "\2"
-- Body of a character class (without the surrounding brackets) covering the
-- ASCII lowercase letters other than a, e, i, o, u, w and y.
local cons = "b-df-hj-np-tvxz"

-- Table of functions exported by this module.
local export = {}

--- Returns the part of `text` that precedes its final run of identical
-- characters (e.g. "k" for "ka", "ky" for "kya"). When nothing precedes that
-- run (single character, empty string, ...) `text` is returned unchanged.
-- @param text string to inspect
-- @return the leading portion of `text`, or `text` itself
local function get_initial(text)
	local last = umatch(text, ".$")
	if not last then
		-- Empty input has no final character; concatenating nil into the
		-- pattern below would raise an error, so return early.
		return text
	end
	-- Escape ASCII punctuation so that characters such as "^", "%" or "]"
	-- cannot corrupt the bracketed set built below. The byte-level pattern
	-- "^%p$" only matches single-byte strings, so multi-byte characters are
	-- left untouched.
	last = last:gsub("^%p$", "%%%0")
	return umatch(text, "(.+)%f[" .. last .. "]") or text
end

--- Scans a romanisation table and records consonant-only initials.
-- For every not-yet-checked entry whose value consists only of letters, the
-- initial of the value (see get_initial) and its counterparts in `d_voicing`
-- and `d_semivoicing` are added as keys of `initials` when they consist only
-- of characters from `cons`. Every visited key is marked in `checked`.
-- @param data table of key -> romanisation, or nil (then nothing happens)
-- @param d_voicing table mapping an initial to its voiced counterpart
-- @param d_semivoicing table mapping an initial to its semi-voiced counterpart
-- @param initials set (table of string -> true) that is filled in place
-- @param checked set of keys already processed; updated in place
local function handle_initials(data, d_voicing, d_semivoicing, initials, checked)
	if not data then
		return
	end
	local consonants_only = "^[" .. cons .. "]+$"
	-- Adds `candidate` to the set when it is made up solely of consonants.
	local function record(candidate)
		if candidate and candidate:match(consonants_only) then
			initials[candidate] = true
		end
	end
	for key, romaji in pairs(data) do
		if not checked[key] and umatch(romaji, "^%a+$") then
			local base = get_initial(romaji)
			record(base)
			local voiced, semivoiced = d_voicing[base], d_semivoicing[base]
			record(voiced)
			record(semivoiced)
		end
		-- Mark the key even when it was skipped, so a later call with
		-- another table does not process it.
		checked[key] = true
	end
end

--- Computes the set of consonant initials for a data table and stores it in
-- `data.initials`.
-- @param data data table; its `rom`, `tr_voicing` and `tr_semivoicing` fields are read
-- @param common truthy when `data` is the common data itself; otherwise the
--   common data module is loaded to supply missing voicing tables and its
--   `rom` table is scanned as well
-- @return the same `data` table
function export.process_data(data, common)
	local initials, checked = {}, {}
	local d_voicing, d_semivoicing = data.tr_voicing, data.tr_semivoicing
	data.initials = initials
	if common then
		handle_initials(data.rom, d_voicing, d_semivoicing, initials, checked)
	else
		data_common = data_common or load_data("Module:Hrkt-translit/data")
		-- Fall back to the common voicing tables where `data` has none.
		d_voicing = d_voicing or data_common.tr_voicing
		d_semivoicing = d_semivoicing or data_common.tr_semivoicing
		-- `data.rom` goes first: keys it marks in `checked` are skipped when
		-- the common table is scanned.
		handle_initials(data.rom, d_voicing, d_semivoicing, initials, checked)
		handle_initials(data_common.rom, d_voicing, d_semivoicing, initials, checked)
	end
	return data
end

--- Builds a lookup function over the transliteration data.
-- The returned function takes a path of keys (e.g. `d("digraph", c, prev)`)
-- and walks nested tables with it. When a language-specific data module
-- "Module:Hrkt-translit/data/<lang>" is found, it is consulted alongside the
-- common data and its value wins whenever it is not nil; otherwise only the
-- common data is used.
-- @param lang language code, or nil to use the common data only
-- @return function(...) performing the lookup
local function get_data(lang)
	data_common = data_common or load_data("Module:Hrkt-translit/data")
	-- Follows the keys in `...` through nested tables starting at `t`.
	-- Returns nil as soon as a non-table is reached while keys remain.
	local function inspect_table(t, ...)
		for i = 1, select("#", ...) do
			if type(t) == "table" then
				-- An index expression keeps only the first value of select().
				t = t[select(i, ...)]
			else return nil end
		end
		return t
	end
	if lang then
		local name_data = "Module:Hrkt-translit/data/" .. lang
		-- NOTE(review): presumably package.loaders[2] returns a truthy value
		-- only when the module exists — confirm against Scribunto's loader.
		if package.loaders[2](name_data) then
			local data_lang = load_data(name_data)
			return function(...)
				-- Both tables are first indexed with the first key only.
				local item_lang, item_common = data_lang[...], data_common[...]
				-- Walk the remaining keys through both tables in parallel.
				for i = 2, select("#", ...) do 
					local key = select(i, ...)
					if type(item_lang) == "table" then
						item_lang = item_lang[key]
					-- Language side ran out: finish the walk in the common data, from key i.
					else return inspect_table(item_common, select(i, ...)) end
					if type(item_common) == "table" then
						item_common = item_common[key]
					-- Common side ran out: item_lang already consumed key i, so continue from key i + 1.
					else return inspect_table(item_lang, select(i + 1, ...)) end
				end
				-- Explicit nil test so that a language-specific `false` still overrides the common value.
				if item_lang ~= nil then return item_lang else return item_common end
			end
		end
	end
	-- No language given or no language module found: common data only.
	return function(...)
		return inspect_table(data_common[...], select(2, ...))
	end
end

--- Replaces the initial of a previously emitted piece with its voiced or
-- semi-voiced counterpart.
-- @param i_last index of the piece in `result` to modify
-- @param result array of romanised pieces
-- @param result_sp table of special tags parallel to `result`
-- @param hist truthy for historical transliteration; otherwise a piece tagged
--   "historical w" gets a "w" prepended before the replacement
-- @param d lookup function as returned by get_data
-- @param key name of the replacement table ("tr_voicing" or "tr_semivoicing")
-- @return the modified text, followed by the substitution count from gsub
local function do_voicing(i_last, result, result_sp, hist, d, key)
	local text = result[i_last]
	if not hist and result_sp[i_last] == "historical w" then
		text = "w" .. text
	end
	if text == "" then
		-- Nothing precedes the voicing mark (e.g. it is the first character
		-- of the input), so there is no initial to replace.
		return text, 0
	end
	-- The initial is arbitrary text from earlier pieces, so escape ASCII
	-- punctuation before using it as a pattern; otherwise characters such as
	-- "(" or "%" would be read as pattern syntax. The whole match is still
	-- the literal initial, which gsub uses as the key into the table.
	local initial = get_initial(text):gsub("%p", "%%%0")
	return text:gsub("^" .. initial, d(key))
end

--- Transliterates kana text into Latin script.
-- Works in three passes over an array of romanised pieces (`result`) and a
-- parallel table of special tags (`result_sp`):
--   1. character-by-character romanisation, with digraph, voicing and
--      vowel-cluster handling;
--   2. gemination, apostrophe disambiguation and long-vowel diacritics;
--   3. capitalisation.
-- @param text string to transliterate
-- @param lang language code, passed to get_data to choose language-specific data
-- @param sc script code; not used by this function
-- @param options optional table; the fields read here are `hist`, `keep_dot`,
--   `phonetic` and `no_diacritics`
-- @return the transliterated string
function export.tr(text, lang, sc, options)
	-- Track inputs containing any character from the kanji range of Module:ja/data/range.
	if umatch(text, "[" .. mw.loadData("Module:ja/data/range").kanji .. "]") then
		require("Module:debug").track("ja/invalid Hrkt")
	end

	options = options or {}
	
	-- result[0] is an empty sentinel, used when getlast finds nothing and returns 0.
	local result = {[0] = ""}
	local result_sp = {}
	
	local d = get_data(lang)
	
	-- Scans `result` backwards from `i_start` (default: the last piece) down
	-- to index 1 and returns the first index accepted by `predicate_good`,
	-- or 0 when the scan ends or `predicate_bad` stops it. Pieces lying
	-- between a ">" piece and the matching "<" piece are skipped.
	-- Default predicate_bad: the piece is tagged "stop".
	-- Default predicate_good: the piece is non-empty and not tagged "'".
	local function getlast(i_start, predicate_good, predicate_bad)
		local in_xml = false
		for i = i_start or #result, 1, -1 do
			if in_xml then
				if result[i] == "<" then in_xml = false end
			elseif result[i] == ">" then
				in_xml = true
			else
				if (predicate_bad or function(index)
					return result_sp[index] == "stop"
				end)(i) then break end
				if (predicate_good or function(index)
					return result[index]:len() > 0 and result_sp[index] ~= "'"
				end)(i) then return i end
			end
		end
		return 0
	end
	
	-- normalize long vowels and iteration marks
	text = toNFD(kata_to_hira(normalize_kana(text)))
	
	-- Pass 1: iterate over UTF-8 characters (one lead byte plus its continuation bytes).
	for c in text:gmatch(".[\128-\191]*") do
		-- Romanisation of the character; falls back to the character itself.
		local rc = options.hist and d("rom_hist", c) or d("rom", c) or c
		-- Special tag for the character, if any.
		local rc_sp = d("rom_sp", c)
		local i_last = getlast()
		
		if options.keep_dot and c == "." then
			rc = "."
		elseif c:match("%a") then
			-- ASCII letters in the input are tagged "stop".
			rc_sp = "stop"
		end
		
		-- A digraph entry keyed by this character and the previous piece
		-- replaces the previous piece and consumes the current character.
		local repl_digraph = d("digraph", c, result[i_last])
		if repl_digraph then
			result[i_last], rc = repl_digraph, ""
			result_sp[i_last], rc_sp = nil, nil
		end
		
		if not options.hist then --はへ
			-- The previous piece carries a flag_hahe tag and is followed by
			-- one of the listed marks or by letter-like output: its text is
			-- replaced with the tag itself and the tag is cleared.
			if d("flag_hahe", result_sp[i_last]) and (umatch(c, "[-~%.゙゚]") or rc:match("[-~%a" .. glottal .. "]")) then
				result[i_last] = result_sp[i_last]
				result_sp[i_last] = nil
			end
			-- The current character carries a flag_hahe tag: use the tag as
			-- its text when `phonetic` is set, when the nearest preceding
			-- piece that is relevant or tagged "stop" is in fact tagged
			-- "stop", or when the previous piece contains letter-like output.
			if d("flag_hahe", rc_sp) and (options.phonetic or result_sp[getlast(nil, function(i)
				return result[i]:len() > 0 and result_sp[i] ~= "'" or result_sp[i] == "stop"
			end, function() return false end)] == "stop" or result[i_last]:match("[-~%a" .. glottal .. "]")) then
				rc = rc_sp
				rc_sp = nil
			end
		end
		
		-- Insert a space after a closing punctuation piece that is followed
		-- by letters, or before an opening bracket/quote that follows letters.
		if rc:match("%a") and umatch(result[i_last], "^[,%.?!:)Ӡ]$") then --space and punctuations
			result[i_last] = result[i_last] .. " "
		elseif umatch(rc, "^[(“]$") and result[i_last]:match("%a") then
			rc = " " .. rc
		end
		
		-- Voicing marks modify the initial of the previous piece.
		if rc_sp == "voiced" then -- voicing
			result[i_last] = do_voicing(i_last, result, result_sp, options.hist, d, "tr_voicing")
		elseif rc_sp == "semivoiced" then
			result[i_last] = do_voicing(i_last, result, result_sp, options.hist, d, "tr_semivoicing")
		end
		
		-- Output ending in a consonant is tagged "coda", unless already tagged "stop".
		if rc:match("[" .. cons .. "]+" .. "$") and rc_sp ~= "stop" then
			rc_sp = "coda"
		end
		
		local r_last = result[i_last]
		-- Last ASCII letter of the previous piece plus any trailing non-letters.
		local r_lastlast = r_last:match"^.*(%a%A*)$" --vowel clusters or stop consonants
		if r_lastlast and r_lastlast:match("[aiueo]") then
			-- NOTE(review): a leading "-" in rc presumably marks small kana — confirm against the data module.
			if rc:match("^%-[yw]") and r_last:match("^[" .. cons .. "yw]") then
				-- "-y"/"-w" after a piece starting with a consonant or glide:
				-- drop the last character of the previous piece and append
				-- the glide, except "y" after a postalveolar consonant.
				local rc_first = rc:sub(2, 2)
				r_last = #r_last > 1 and r_last:sub(1, -2) or r_last
				if not (rc_first == "y" and d("flag_postalveolarconsonant", r_last)) then
					r_last = r_last .. rc_first
				end
				result[i_last] = r_last
				rc = rc:sub(3)
			elseif options.hist and r_last:match("^[" .. cons .. "]") and (
				r_lastlast == "i" and rc:sub(1, 1) == "y" or
				r_lastlast == "u" and rc:sub(1, 1) == "w"
			) then
				-- Historical mode only: consonant + "i" followed by "y...", or
				-- consonant + "u" followed by "w...", is merged the same way.
				local rc_first = rc:sub(1, 1)
				r_last = r_last:sub(1, -2)
				if not (rc_first == "y" and d("flag_postalveolarconsonant", r_last)) then
					r_last = r_last .. rc_first
				end
				result[i_last] = r_last
				rc = rc:sub(2)
			elseif rc:match"^%-[yw]?[aiueo]$" then
				-- "-" plus an optional glide and one vowel: strip the "-",
				-- then adjust the previous piece.
				rc = rc:sub(2)
				if r_lastlast == rc then
					-- Same vowel: double it on the previous piece and emit nothing here.
					result[i_last] = r_last .. r_lastlast
					rc = ""
				elseif d("flag_specialconsonant", r_last) then
					result[i_last] = r_last:sub(1, -2)
				elseif r_lastlast == "i" then
					-- Final "i" of the previous piece becomes "y".
					result[i_last] = r_last:sub(1, -2) .. "y"
				elseif r_lastlast:match("[ou]") and rc ~= "u" then
					-- Final "o"/"u" of the previous piece becomes "w".
					result[i_last] = r_last:sub(1, -2) .. "w"
				elseif #r_last > 1 then
					result[i_last] = r_last:sub(1, -2)
				end
			end
		end
		
		insert(result, rc)
		result_sp[#result] = rc_sp
	end
	
	-- A flag_hahe-tagged piece with no relevant piece before it also takes its tag as text.
	if not options.hist then --isolated はへ
		local i_last = getlast()
		if d("flag_hahe", result_sp[i_last]) and getlast(i_last - 1) == 0 then
			result[i_last] = result_sp[i_last]
		end
	end
	
	-- Pass 2: gemination, apostrophe disambiguation and long vowels.
	-- `has_gem` is set when a "gem"-tagged piece is seen and cleared once a
	-- later piece starting with a consonant has resolved it.
	local has_gem = false
	for i, v in ipairs(result) do
		--gemination
		if has_gem then
			-- Split off leading glottal placeholders and the consonant cluster.
			local apos, consonant, remainder = v:match("^(" .. glottal .. "*)([" .. cons .. "yw]+)(.*)")
			if consonant then
				local init, c_gem = apos .. consonant
				-- Trim trailing "y"/"w" from `init` one character at a time;
				-- c_gem ends up as the tr_gem lookup of the final, trimmed `init`.
				while true do
					c_gem = d("tr_gem", init)
					if #init == 1 or not init:match("[yw]$") then
						break
					end
					init = init:sub(1, -2)
				end
				-- Without a tr_gem entry, fall back to the first byte of `init`.
				c_gem = c_gem or init:sub(1, 1)
				-- The glottal placeholders are removed from this piece.
				v = consonant .. remainder
				local i_gem = getlast(i)
				-- Walk backwards: fill every "gem" piece with c_gem and step
				-- over "allow gem" pieces; at the first other piece, put the
				-- placeholders in front of the piece found by getlast just after it.
				while true do
					i_gem = getlast(i_gem - 1)
					if result_sp[i_gem] == "gem" then
						result[i_gem] = c_gem
					elseif result_sp[i_gem] ~= "allow gem" then
						i_gem = getlast(i_gem + 1)
						result[i_gem] = apos .. result[i_gem]
						break
					end
				end
				has_gem = false
			end
		elseif result_sp[i] == "gem" then
			has_gem = true
		end
		
		-- FIXME: ng/nw should be determined automatically by a disambiguation model.
		-- Pieces starting with a vowel, a glide, or "ng"/"nw" may need a
		-- disambiguating placeholder in front of them.
		local v_first = v:match("^[aiueoyw]") or v:match("^n[gw]")
		if v_first then
			local i_last
			-- Both searches pass a predicate_bad that never stops the scan.
			if v_first == "y" or v_first == "w" or v_first == "ng" or v_first == "nw" then
				-- Also skip "." pieces and pieces tagged "gem".
				i_last = getlast(i - 1, function(index)
					local res, res_sp = result[index], result_sp[index]
					return res ~= "" and res ~= "." and res_sp ~= "'" and res_sp ~= "gem"
				end, function() end)
			else
				i_last = getlast(i - 1, nil, function() end)
			end
			if v_first:sub(1, 1) == "n" then
				-- "ng"/"nw" after a piece containing a letter, except "nw" after a piece ending in "n".
				if umatch(result[i_last], "%a") and not (v_first == "nw" and result[i_last]:match("n$")) then
					v = disambig .. v
				end
			elseif result_sp[i_last] == "coda" then
				-- After a coda, add the placeholder unless tr_coda_apos has an
				-- entry for this pair; in historical mode the entry "hist"
				-- also adds it.
				local coda = d("tr_coda_apos", v_first, result[i_last])
				if coda == nil or options.hist and coda == "hist" then
					v = disambig .. v
				end
			end
		end
		
		--Diacritics (long vowels and others).
		-- Each vowel followed by further vowels/non-letters is looked up in the tr_long table.
		v = v:gsub("[aiueo][aiueo%A]*", d("tr_long")) -- From small kana.
		local i_last = getlast(i - 1)
		local r_last = result[i_last]
		-- From digraphs.
		-- Skipped for historical, phonetic and no_diacritics output.
		if r_last and not (options.hist or options.phonetic or options.no_diacritics) then
			local r_lastlast = r_last:match"^.*(%a%A*)$" --vowel clusters or stop consonants
			-- When the end of the previous piece plus this piece has a tr_long
			-- entry, and the previous piece does not already end in two
			-- vowels, merge this piece into the previous one.
			if r_lastlast and d("tr_long", r_lastlast .. v) and not r_last:match("[aiueo][aiueo]$") then
				result[i_last] = (r_last .. v):gsub("[aiueo][aiueo%A]*", d("tr_long"))
				v = ""
			end
		end
		
		result[i] = v
	end
	
	-- Pass 3: capitalisation. Each "cap" tag adds one pending capital;
	-- characters are uppercased until that many have actually changed.
	local num_cap = 0
	for i, v in ipairs(result) do
		--uppercase
		if result_sp[i] == "cap" then
			num_cap = num_cap + 1
		end
		if num_cap > 0 then
			result[i] = v:gsub(".[\128-\191]*", function(c)
				if num_cap <= 0 then return c end
				local uc = c:uupper()
				-- Only characters that uppercasing changes use up a pending capital.
				if c ~= uc then num_cap = num_cap - 1 end
				return uc
			end)
		end
	end
	
	-- Join the pieces and turn both placeholder characters into apostrophes;
	-- the parentheses drop gsub's second return value.
	return (concat(result):gsub("[" .. glottal .. disambig .. "]", "'"))
end

return export