Documentation for this module may be created at વિભાગ:grc-accent/doc

local export = {}

local m_data = mw.loadData("Module:grc-utilities/data")
local m_table = require("Module:table")

local diacritics_list = m_data.diacritics
local ACUTE = diacritics_list.acute
local GRAVE = diacritics_list.grave
local CIRCUMFLEX = diacritics_list.circum
local DIAERESIS = diacritics_list.diaeresis
local SMOOTH = diacritics_list.smooth
local ROUGH = diacritics_list.rough
local MACRON = diacritics_list.macron
local BREVE = diacritics_list.breve
local SUBSCRIPT = diacritics_list.subscript

local diacritic_pattern = m_data.all
local diacritic_groups = m_data.diacritic_groups
local tonal_diacritic = diacritic_groups[3] -- acute, grave, circumflex
local long_diacritics = MACRON .. SUBSCRIPT .. CIRCUMFLEX

local either_vowel = "[ΑαΙιΥυ]"

local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local lower = mw.ustring.lower
local decompose = mw.ustring.toNFD

local tokenize = require('Module:grc-utilities').tokenize
local copy = m_table.shallowcopy

local function if_not_empty(var)
	if var == "" then
		return nil
	else
		return var
	end
end

local function contains_vowel(token)
	return find(token, '[ΑΕΗΙΟΥΩαεηιουω]')
end

export.contains_vowel = contains_vowel

local function is_diphthong(token)
	if find(token, "[ΑαΕεΗηΙιΟοΥυΩω][ΙιΥυ]") then
		return true
	else
		return false
	end
end

local libraryUtil = require('libraryUtil')
local checkType = libraryUtil.checkType
local checkTypeMulti = libraryUtil.checkTypeMulti

local function _check(funcName, expectType)
	if type(expectType) == "string" then
		return function(argIndex, arg, nilOk)
			checkType(funcName, argIndex, arg, expectType, nilOk)
		end
	else
		return function(argIndex, arg, expectType, nilOk)
			if type(expectType) == "table" then
				checkTypeMulti(funcName, argIndex, arg, expectType, nilOk)
			else
				checkType(funcName, argIndex, arg, expectType, nilOk)
			end
		end
	end
end

--[[
	A vowel with a breve or a lone epsilon or omicron is considered short.
	Everything else is considered long, including unmarked alphas, iotas, and
	upsilons. Sigh.
]]
local function is_short(token)
	if find(token, BREVE) or find(token, '[ΕΟεο]') and not find(token, '[ιυ]') then
		return true
	else
		return false
	end
end

local function conditional_gsub(...)
	local str, count = gsub(...)
	if count and count > 0 then
		return str
	else
		return nil
	end
end

local accent_adding_functions = {
	-- This will not throw an error if η or ω has a macron on it.
	[CIRCUMFLEX] = function(vowel)
		return (gsub(
			vowel,
			"([ΑαΗηΙιΥυΩω])" .. MACRON .. "?(" .. diacritic_groups[2] .. "?)(" .. SUBSCRIPT .. "?)$",
			"%1%2" .. CIRCUMFLEX .. "%3"
		))
	end,
	[ACUTE] = function(vowel)
		return (
			conditional_gsub(vowel,
				"([Εε])([Ωω])",
				"%1" .. ACUTE .. "%2") or
			gsub(vowel,
				"([ΑαΕεΗηΙιΟοΥυΩω]" .. diacritic_groups[1] .. "?" .. diacritic_groups[2] .. "?)(" .. SUBSCRIPT .. "?)$",
				"%1" .. ACUTE .. "%2"))
	end,
	[MACRON] = function(vowel)
		if find(vowel, "[" .. long_diacritics .. "]") or is_diphthong(vowel) then
			return vowel
		elseif find(vowel, "[ΕΟεο]") then
			error("The vowel " .. vowel ..
					" is short, so a macron cannot be added to it.")
		else
			return (gsub(vowel, "(" .. either_vowel .. ")", "%1" .. MACRON))
		end
	end,
	[BREVE] = function(vowel)
		if find(vowel, "[" .. long_diacritics .. "]") then
			error("The vowel " .. vowel ..
					" has a iota subscript, a macron, or a circumflex, so a breve cannot be added to it.")
		elseif is_diphthong(vowel) then
			error("The vowel " .. vowel ..
					" is a diphthong, so a breve cannot be added to it.")
		else
			return (gsub(vowel, "(" .. either_vowel .. ")", "%1" .. BREVE))
		end
	end,
	-- This will insert a diaeresis on a single iota or upsilon, or on a
	-- iota or upsilon that is the second element of a diphthong.
	-- It does nothing if the vowel has a breathing on it.
	[DIAERESIS] = function(vowel)
		return (gsub(
			vowel,
			"([ΙιΥυ]" .. diacritic_groups[1] .. "?)(" .. tonal_diacritic .. "?)$",
			"%1" .. DIAERESIS .. "%2"
		))
	end
}

-- Assumes decomposed vowels (NFD).
local function add(vowel, accent)
	if type(accent_adding_functions[accent]) == "function" then
		return accent_adding_functions[accent](vowel)
	else
		local name = m_table.keyFor(diacritics_list, accent)
		if name == "circum" then
			name = "circumflex"
		end
		error("No function for adding a " .. name .. ".")
	end
end

function export.strip_accent(word)
	word = decompose(word)
	-- Parentheses suppress second return value of gsub, the number of substitutions.
	return (gsub(word, diacritic_pattern, ''))
end

function export.strip_tone(word)
	word = decompose(word)
	if find(word, CIRCUMFLEX) then
		word = copy(tokenize(word))
		for i = 1, #word do
			-- Add a macron to every vowel with a circumflex and remove the circumflex.
			word[i] = gsub(word[i],
				'^([αΑιΙυΥ])([' .. SMOOTH .. ROUGH .. DIAERESIS .. ']*)' .. CIRCUMFLEX .. '$',
				'%1' .. MACRON .. '%2')
		end
		word = table.concat(word)
	end
	return (gsub(word, tonal_diacritic, ''))
end

function export.ult(word)
	word = decompose(word)
	if find(word, tonal_diacritic) then return word end
	
	word = copy(tokenize(word))
	for i, token in m_table.reverseIpairs(word) do
		if contains_vowel(token) then
			--fortunately accents go last in combining order
			word[i] = add(token, ACUTE)
			break
		end
	end
	return table.concat(word, '')
end

--[[ WARNING: Given an unmarked α ι υ, this function will return a circmflex.
That said, if you ran into this situation in the first place, you probably
are doing something wrong. ]] --
function export.circ(word)
	word = decompose(word)
	if find(word, tonal_diacritic) then return word end
	
	word = copy(tokenize(word))
	for i, token in m_table.reverseIpairs(word) do
		if contains_vowel(token) then
			if is_short(token) then
				word[i] = add(token, ACUTE)
			else
				word[i] = add(token, CIRCUMFLEX)
			end
			break
		end
	end
	return table.concat(word, '')
end

function export.penult(orig)
	local word = decompose(orig)
	if find(word, tonal_diacritic) then return word end
	
	word = copy(tokenize(word))
	local syllables = 0
	for i, token in m_table.reverseIpairs(word) do
		if token == '-' then
			return orig
		elseif contains_vowel(token) then
			syllables = syllables + 1
			if syllables == 2 then
				word[i] = add(token, ACUTE)
				return table.concat(word, '')
			end
		end
	end
	
	return export.circ(orig)
end

function export.pencirc(orig)
	local word = decompose(orig)
	if find(word, tonal_diacritic) then return word end
	
	word = copy(tokenize(word))
	local syllables = 0
	local long_ult = false
	for i, token in m_table.reverseIpairs(word) do
		if token == '-' then return orig end
		if contains_vowel(token) then
			syllables = syllables + 1
			if syllables == 1 and not is_short(token) then
				long_ult = true
				if word[#word] == 'αι' or word[#word] == 'οι' then long_ult = false end
			elseif syllables == 2 then
				if is_short(token) or long_ult then
					word[i] = add(token, ACUTE)
				else
					word[i] = add(token, CIRCUMFLEX)
				end
				return table.concat(word, '')
			end
		end
	end
	
	return export.circ(orig)
end

function export.antepenult(orig)
	local word = decompose(orig)
	if find(word, tonal_diacritic) then return word end
	
	word = copy(tokenize(word))
	local syllables = 0
	local long_ult = false
	for i, token in m_table.reverseIpairs(word) do
		if token == '-' then return orig end
		if contains_vowel(token) then
			syllables = syllables + 1
			if syllables == 1 and not is_short(token) then
				long_ult = true
				if word[#word] == 'αι' or word[#word] == 'οι' then long_ult = false end
			elseif syllables == 2 and long_ult then
				word[i] = add(token, ACUTE)
				return table.concat(word, '')
			elseif syllables == 3 then
				word[i] = add(token, ACUTE)
				return table.concat(word, '')
			end
		end
	end
	
	return export.pencirc(orig)
end

--[[
	Counts from the beginning or end of the word, and returns the position and
	type of the first accent found. Position means the number of vowels
	(syllables) that have been encountered, not the number of characters.
	
	Arguments:
	- word:			string	(Ancient Greek word)
	- from_end:		boolean	(whether to count from the end of the word)
]]
local accent_cache = { [true] = {}, [false] = {} }

function export.detect_accent(word, from_end)
	local check = _check("detect_accent")
	check(1, word, "string")
	check(2, from_end, "boolean", true)
	
	local cache = accent_cache[from_end == true][decompose(word)]
	if cache then
		return unpack(cache)
	end
	
	local names = {
		[ACUTE] 		= "acute",
		[GRAVE] 		= "grave",
		[CIRCUMFLEX]	= "circumflex",
	}
	
	local syllable = 0
	local accent_name
	
	for _, token in
			(from_end and m_table.reverseIpairs or ipairs)(tokenize(word))
			do
		if contains_vowel(token) then
			syllable = syllable + 1
			
			accent_name = names[match(token, tonal_diacritic)]
			if accent_name then
				accent_cache[from_end == true][decompose(word)] = { syllable, accent_name }
				return syllable, accent_name
			end
		end
	end
	
	return nil
end

--[[
	Returns classification based on first accent found
	when traveling back from the end of the word.
]]
function export.get_accent_term(word)
	local syllable, accent_name = export.detect_accent(word, true)
	
	local terms = {
		["grave"]		= { "barytone" },
		["acute"] 		= { "oxytone", "paroxytone", "proparoxytone" },
		["circumflex"]	= { "perispomenon", "properispomenon" },
	}
	
	local ordinals = { "first", "second", "third", "fourth", "fifth", }
	
	local term
	if syllable and accent_name then
		term = terms[accent_name][syllable]
	end
	
	if term then
		return term
	else
		return nil,
			syllable and 'There is no term for a word with a ' .. accent_name ..
				' accent on the ' .. ordinals[syllable] ..
				' syllable from the end of the word.'
			or 'No accent found.'
	end
end

-- is_noun is a boolean or nil; if it is true, αι and οι will be
-- treated as short.
function export.get_length(token, short_diphthong)
	local token = lower(token)
	-- not needed at the moment
	-- token = decompose(token)
	
	if not contains_vowel(token) then
		return nil
		-- error("The thing supplied to get_length does not have any vowels")
	end
	
	-- η, ω; ᾳ, ῃ, ῳ; ᾱ, ῑ, ῡ; diphthongs
	if find(token, "[ηω" .. long_diacritics .. "]") then
		return "long"
	end
	
	if short_diphthong and find(token, "^[αο]ι") then
		return "short"
	end
	
	if is_diphthong(token) then
		return "long"
	end
	
	-- ε, ο; ᾰ, ῐ, ῠ
	if find(token, "[εο" .. BREVE .. "]") then
		return "short"
	end
	
	-- anything else
	return "either"
end

-- Takes a table of tokens and returns a table containing tables of each vowel's
-- characteristics.
function export.get_vowel_info(tokens, short_diphthong)
	if type(tokens) ~= "table" then
		error("The argument to get_vowel_info must be a table.")
	end
	
	local vowels = {}
	local vowel_i = 1
	if find(tokens[#tokens], m_data.consonant .. "$") then
		short_diphthong = false
	end
	
	for i, token in m_table.reverseIpairs(tokens) do
		if contains_vowel(token) then
			if vowel_i ~= 1 then
				short_diphthong = false
			end
			local length, accent =
				export.get_length(token, short_diphthong),
				if_not_empty(match(token,
					"[" .. ACUTE .. GRAVE .. CIRCUMFLEX .. "]"))
			vowels[vowel_i] = {
					index = i,
					length = length,
					accent = accent,
			}
			vowel_i = vowel_i + 1
		end
	end
	
	return vowels
end

function export.mark_implied_length(word, return_tokens, short_diphthong)
	word = decompose(word)
	-- Do nothing if there are no vowel letters that could be ambiguous.
	if not find(word, either_vowel) then
		if return_tokens then
			return tokenize(word)
		else
			return word
		end
	end
	
	local tokens = copy(tokenize(word))
	local vowels = export.get_vowel_info(tokens, short_diphthong)
	
	if #vowels >= 2 then
		local ultima = vowels[1]
		local ultima_i = ultima.index
		
		local penult = vowels[2]
		local penult_i = penult.index
		
		if penult.length == "either" and ultima.length == "short" then
			if penult.accent == CIRCUMFLEX then
				tokens[penult_i] = add(tokens[penult_i], MACRON)
			elseif penult.accent == ACUTE then
				tokens[penult_i] = add(tokens[penult_i], BREVE)
			end
		elseif penult.length == "long" and ultima.length == "either" then
			if penult.accent == CIRCUMFLEX then
				tokens[ultima_i] = add(tokens[ultima_i], BREVE)
			elseif penult.accent == ACUTE then
				tokens[ultima_i] = add(tokens[ultima_i], MACRON)
			end
		end
		
		local antepenult = vowels[3]
		if antepenult and antepenult.accent and ultima.length == "either" then
			tokens[ultima_i] = add(tokens[ultima_i], BREVE)
		end
	end
	
	if return_tokens then
		return tokens
	else
		return table.concat(tokens)
	end
end

-- Returns the length of a syllable specified by its position from the end of the word.
function export.length_at(word, syllable)
	local tokens = tokenize(word)
	
	if type(word) ~= "string" then
		error("First argument of length_at should be a string.")
	end
	
	if type(syllable) ~= "number" then
		error("Second argument of length_at should be a number.")
	end
	
	local syllable_count = 0
	for _, token in m_table.reverseIpairs(tokens) do
		local length = export.get_length(token)
		if length then
			syllable_count = syllable_count + 1
			if syllable_count == syllable then
				return length
			end
		end
	end
	
	if syllable_count < syllable then
		error("Length for syllable " .. syllable .. " from the end of the word was not found.")
	end
end

local function find_breathing(token)
	return match(token, "([" .. ROUGH .. SMOOTH .. "])")
end

local function has_same_breathing_as(token1, token2)
	return find_breathing(token1) == find_breathing(token2)
end

-- Make token have the length specified by the string "length".
local function change_length(length, token)
	local diacritic
	if length == "long" then
		diacritic = MACRON
	elseif length == "short" then
		diacritic = BREVE
	end
	
	if diacritic then
		return add(token, diacritic)
	else
		return token
	end
end

--[[
	Take two words, mark implied length on each, then harmonize any macrons and
	breves that disagree.
]]
function export.harmonize_length(word1, word2)
	word1 = decompose(word1)
	-- Do nothing if there are no vowel letters that could be ambiguous.
	if not (find(word1, either_vowel) or find(word2, either_vowel)) then
		return word1, word2
	end
	
	local tokens1, tokens2 = export.mark_implied_length(word1, true), export.mark_implied_length(word2, true)
	local strip1, strip2 = copy(tokenize(export.strip_accent(word1))), copy(tokenize(export.strip_accent(word2)))
	
	for i, token1 in pairs(tokens1) do
		local token2 = tokens2[i]
		
		if strip1[i] == strip2[i] then
			if has_same_breathing_as(token1, token2) then
				local length1, length2 = export.get_length(token1), export.get_length(token2)
				if length1 and length2 and length1 ~= length2 then
						if length1 == "either" then
							tokens1[i] = change_length(length2, token1)
						elseif length2 == "either" then
							tokens2[i] = change_length(length1, token2)
						end
				end
			else
				break
			end
		else
			break
		end
	end
	
	local new_word1, new_word2 = table.concat(tokens1), table.concat(tokens2)
	
	return new_word1, new_word2
end

--[[
	Get weight of nth syllable from end of word. Position defaults to 1, the last
	syllable. Returns "heavy" or "light", or nil if syllable is open with an
	ambiguous vowel.
]]
function export.get_weight(word, position)
	if not if_not_empty(word) then
		return nil
	end
	local tokens = tokenize(word)
	
	if not position then
		position = 1
	end
	
	local vowel
	local vowel_index = 0
	
	-- Find nth vowel from end of word.
	for i, token in m_table.reverseIpairs(tokens) do
		local length = export.get_length(token)
		if length then
			vowel_index = vowel_index + 1
			if vowel_index == position then
				vowel = { index = i, length = length }
				break
			end
		end
	end
	
	if not vowel then
		return nil
	end
	
	if vowel.length == "long" then
		return "heavy"
	else
		-- Count consonants after the vowel.
		local consonant_count = 0
		
		for i = vowel.index + 1, #tokens do
			if not contains_vowel(tokens[i]) then
				consonant_count = consonant_count + 1
			else
				break
			end
		end
		
		if consonant_count > 1 then
			return "heavy"
		elseif vowel.length == "short" then
			return "light"
		else
			return nil
		end
	end
end

--[[
	Add accent mark at position. Position is a number that refers to the nth
	vowel from the beginning of the word. Respects the rules of accent.
	Examples:
	- δημος,	1		=> δῆμος
	- προτερᾱ,	1		=> προτέρᾱ	(position changed to 2 because ultima is long)
	- μοιρα,	1, true	=> μοῖρα	(circumflex can be added because ultima is
										ambiguous)
	- χωρᾱ,		1, true	=> χώρᾱ		(circumflex can't be added because ultima
										is long)
	- τοιουτος,	2		=> τοιοῦτος	(circumflex because ultima is short)
	
	Arguments:
	- word:					string	(hopefully an Ancient Greek word or stem)
	- syllable_position:	number	(less than the number of monophthongs or diphthongs
										in the word)
	- options:				table
		- circumflex		boolean		(add a circumflex if allowed)
		- synaeresis		boolean		(accent can fall before εω in penult
											and ultima: πόλεως)
		- short_diphthong	boolean		(word-final οι, αι count as short)
]]
function export.add_accent(word, syllable_position, options)
	local check = _check("add_accent")
	check(1, word, "string")
	check(2, syllable_position, "number")
	check(3, options, "table", true)
	
	word = decompose(word)
	if find(word, tonal_diacritic) then
		return word
	end
	
	options = options or {}
	
	local tokens = copy(tokenize(word))
	local vowels = export.get_vowel_info(tokens, options.short_diphthong)
	local vowel_count = #vowels
	
	-- Convert positions in relation to the beginning of the word
	-- to positions in relation to the end of the word.
	-- The farthest back that an accent can be placed is 3 (the antepenult),
	-- so that is the greatest allowed position.
	if syllable_position > 0 then
		syllable_position = math.min(3, vowel_count - syllable_position + 1)
	-- If the position is in relation to the end of the word and it is greater
	-- than the length of the word, then reduce it to the length of the word.
	-- This is for practical reasons. Positions in relation to the beginning of
	-- the word do not need leeway.
	elseif syllable_position < 0 then
		syllable_position = math.min(-syllable_position, vowel_count)
	end
	
	if syllable_position == 0 then
		error("Invalid position value " .. syllable_position .. ".")
	elseif syllable_position > vowel_count then
		error("The position " .. syllable_position .. " is invalid, because the word has only " .. vowel_count .. " vowels.")
	end
	
	-- Apply accent rules to change the accent's position or type.
	local accent_mark = options.circumflex and CIRCUMFLEX or ACUTE
	local ultima = vowels[1]
	
	-- If synaeresis is selected, a final vowel sequence εω (optionally
	-- separated by an undertie) counts as one syllable.
	if syllable_position == 3 then
		local penult = vowels[2]
		if not options.force_antepenult and (ultima.length == "long"
				and not (options.synaeresis
				and ("Ωω"):find(tokens[ultima.index], 1, true)
				and ("Εε"):find(tokens[penult.index], 1, true)
				and (ultima.index == penult.index + 1
				or ultima.index == penult.index + 2
				and tokens[penult.index + 1] == mw.ustring.char(0x035C)))) then
			syllable_position = 2
		else
			accent_mark = ACUTE
		end
	end
	
	if syllable_position == 2 then
		if ultima.length == "short" and vowels[2].length == "long"  then
			accent_mark = CIRCUMFLEX
		elseif ultima.length == "long" then
			accent_mark = ACUTE
		end
	end
	
	local vowel = vowels[syllable_position]
	if not vowel then
		error('No vowel at position ' .. syllable_position ..
			' from the end of the word ' .. word .. '.')
	end
	if vowel.length == "short" then
		accent_mark = ACUTE
	end
	
	local i = vowel.index
	tokens[i] = add(tokens[i], accent_mark)
	
	return table.concat(tokens)
end

function export.syllables(word, func, number)
	local check = _check('syllables')
	check(1, word, 'string')
	check(2, func, 'string', true)
	check(3, number, 'number', true)
	
	if not func then
		error('No function specified')
	end
	
	local functions = {
		eq = function (word, number)
			local vowels = 0
			for _, token in ipairs(tokenize(word)) do
				if contains_vowel(token) then
					vowels = vowels + 1
					if vowels > number then
						return false
					end
				end
			end
			if vowels == number then
				return true
			end
			return false
		end
	}
	
	func = functions[func]
	if func then
		return func(word, number)
	else
		error('No function ' .. func)
	end
end

return export