Module:Unicode data/testcases

From The Pinched Universe
Revision as of 22:42, 24 June 2024 by Hori (talk | contribs) (Copied from https://en.wikipedia.org/w/index.php?title=Module:Unicode_data/testcases)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

The documentation for this module is not intended to be hosted on this wiki.

However, you might be able to find it at one of the following locations:

local p = require "Module:UnitTests"
local Unicode_data = require(mw.title.getCurrentTitle().subpageText == "sandbox"
	and "Module:Unicode data/sandbox" or "Module:Unicode data")

local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint
-- Builds the label used to identify a code point in the test output.
-- Non-printable code points are shown as "U+XXXX" only; printable ones are
-- followed by the character itself.
local function show(codepoint)
	if not Unicode_data.is_printable(codepoint) then
		return string.format("U+%04X", codepoint)
	end

	local glyph = U(codepoint)
	-- A character that is not stable under NFC is emitted as a numeric
	-- character reference instead of as a literal (presumably so that it is
	-- not altered by normalization of the page text; confirm).
	if mw.ustring.toNFC(glyph) ~= glyph then
		glyph = string.format("&#x%X;", codepoint)
	end

	-- Combining characters get a dotted circle to attach to.
	local base = Unicode_data.is_combining(codepoint) and "◌" or ""
	return string.format("U+%04X: %s", codepoint, base .. glyph)
end

-- Same label as show(), with the name reported by Unicode_data.lookup_name
-- appended in parentheses.
local function show_codepoint_and_name(codepoint)
	local label = show(codepoint)
	local name = Unicode_data.lookup_name(codepoint)
	return string.format("%s (%s)", label, name)
end

-- Compares Unicode_data.lookup_name with the expected name (or the expected
-- angle-bracketed label for control, reserved, surrogate, private-use and
-- noncharacter code points) for a spread of code points.
function p:test_lookup_name()
	local function check(tester, codepoint, name)
		local label = show(codepoint)
		local actual = Unicode_data.lookup_name(codepoint)
		tester:equals(label, actual, name)
	end

	-- Each case is { code point, expected name }.
	self:iterate({
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>" },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFDD1, "<noncharacter-FDD1>" },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}, check)
end

-- Compares Unicode_data.lookup_age with the expected version string for a
-- spread of code points ("NA" is expected for the reserved ones).
function p:test_lookup_age()
	local function check(tester, codepoint, age)
		-- The comparison runs inside pcall so that any error it raises
		-- (for example lookup_age not existing yet) is discarded.
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		local function attempt()
			tester:equals(
				show(codepoint),
				Unicode_data.lookup_age(codepoint),
				age)
		end
		pcall(attempt)
	end

	-- Each case is { code point, expected age }.
	self:iterate({
		{ 0x0061, "1.1" },
		{ 0x0378, "NA" },
		{ 0x1B44, "5.0" },
		{ 0x3555, "3.0" },
		{ 0xAC01, "2.0" },
		{ 0xDC00, "2.0" },
		{ 0xEEEE, "1.1" },
		{ 0xFDD1, "3.1" },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}, check)
end

-- Checks Unicode_data.is_combining on one code point expected to be
-- combining and one expected not to be.
function p:test_is_combining()
	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		local actual = Unicode_data.is_combining(codepoint)
		tester:equals(label, actual, expected)
	end

	-- Each case is { code point, expected result }.
	self:iterate({
		{ 0x0300, true },
		{ 0x0060, false },
	}, check)
end

-- Checks Unicode_data.is_default_ignorable on one code point expected to be
-- ignorable and one expected not to be.
function p:test_is_default_ignorable()
	local function check(tester, codepoint, expected)
		-- The comparison runs inside pcall so that any error it raises
		-- (for example is_default_ignorable not existing yet) is discarded.
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		local function attempt()
			tester:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.is_default_ignorable(codepoint),
				expected)
		end
		pcall(attempt)
	end

	-- Each case is { code point, expected result }.
	self:iterate({
		{ 0x0061, false },
		{ 0x00AD, true },
	}, check)
end

-- Compares Unicode_data.lookup_script with the expected four-letter script
-- code for a handful of code points.
function p:test_lookup_script()
	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		local actual = Unicode_data.lookup_script(codepoint)
		tester:equals(label, actual, expected)
	end

	-- Each case is { code point, expected script code }.
	self:iterate({
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}, check)
end

-- Compares Unicode_data.lookup_category with the expected two-letter
-- General Category code. Characters that can be typed legibly are given as
-- literals and converted with get_codepoint; the rest are given in hex.
function p:test_lookup_category()
	local function check(tester, codepoint, expected)
		local label = show_codepoint_and_name(codepoint)
		local actual = Unicode_data.lookup_category(codepoint)
		tester:equals(label, actual, expected)
	end

	-- Each case is { code point, expected category }.
	local cases = {
		{ get_codepoint("\t"), "Cc" },
		{ get_codepoint(" "), "Zs" },
		{ get_codepoint("["), "Ps" },
		{ get_codepoint("]"), "Pe" },
		{ get_codepoint("^"), "Sk" },
		{ get_codepoint("A"), "Lu" },
		{ 0x00AD, "Cf" },
		{ get_codepoint("¾"), "No" },
		{ get_codepoint("«"), "Pi" },
		{ get_codepoint("»"), "Pf" },
		{ 0x0300, "Mn" },
		{ 0x0488, "Me" },
		{ get_codepoint("٣"), "Nd" },
		{ get_codepoint("子"), "Lo" },
		{ get_codepoint("ᾮ"), "Lt" },
		{ 0x1B44, "Mc" },
		{ get_codepoint("∈"), "Sm" },
		{ get_codepoint("‿"), "Pc" },
		{ get_codepoint("↹"), "So" },
		{ get_codepoint("⸗"), "Pd" },
		{ get_codepoint("Ⅷ"), "Nl" },
		{ 0x2028, "Zl" },
		{ 0x2029, "Zp" },
		{ get_codepoint("ゞ"), "Lm" },
		{ 0xD800, "Cs" },
		{ get_codepoint("£"), "Sc" },
		{ 0xFFFF, "Cn" },
		{ 0x100000, "Co" },
	}

	self:iterate(cases, check)
end

local fun = require "Module:Fun"
local m_table = require "Module:TableTools"

-- Metatable for tables that tally occurrences per script. It is also its own
-- metatable, so calling it returns a fresh, empty tally table.
local script_to_count_mt = {}

-- A key that has not been counted yet is stored as 0 and reads as 0.
function script_to_count_mt.__index(self, key)
	self[key] = 0
	return 0
end

-- Calling the table this metatable is attached to creates a new table that
-- uses that same table as its metatable.
function script_to_count_mt.__call(self, ...)
	return setmetatable({}, self)
end

setmetatable(script_to_count_mt, script_to_count_mt)

-- Memoizing lookup from a string to a printable summary of how many of its
-- code points belong to each script, e.g. "Latn (7), Zyyy (1)", ordered by
-- descending count. Indexing with a non-string yields nil.
local script_counts = setmetatable({}, {
	__index = function (cache, text)
		if type(text) ~= "string" then
			return nil
		end

		-- Tally the script of every code point in the string.
		local tally = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(text) do
			local script = Unicode_data.lookup_script(codepoint)
			tally[script] = tally[script] + 1
		end

		-- Key comparator for sortedPairs: most frequent script first.
		local function by_descending_count(script1, script2)
			return tally[script1] > tally[script2]
		end

		-- mapIter passes the value (count) before the key (script).
		local function describe(count, script)
			return ("%s (%d)"):format(script, count)
		end

		local summary = table.concat(
			fun.mapIter(
				describe,
				m_table.sortedPairs(tally, by_descending_count)),
			", ")

		-- Store the result so that the next lookup bypasses __index.
		cache[text] = summary
		return summary
	end,
})

-- Shared examples for test_get_best_script and test_is_Latin.
-- Table entries are { text, expected best script }; the second element is
-- nil where no best script is expected. test_is_Latin also accepts an
-- optional third element giving the expected is_Latin result directly, but no
-- entry here supplies one, so it expects true exactly when the second element
-- is "Latn".
-- Bare string entries ("HTML character references", "Halkomelem", "Quotes")
-- are presumably rendered as section headings by the iterate method of
-- [[Module:UnitTests]]; confirm there.
local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	{ "’ʼ“”†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },
	"HTML character references",
	{ "𐘀", "Lina" },
	{ "&#x10600;", "Lina" },
	{ "–", nil },
	{ "&ndash;", nil },
	-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xʷməθkʷəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },
	"Quotes",
	 -- [[s:it:Divina Commedia/Inferno/Canto I]]
	{
[[Tant’è amara che poco è più morte;
ma per trattar del ben ch’i’ vi trovai,
dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
		
	},
	{   -- A blessing in Navajo:
		--[[User talk:Stephen G. Brown/text8]]
[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. 
Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní
bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo
nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{	-- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ:
[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος
οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε,
πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν
ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν
οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
		"Grek"
	},
	{ -- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
[[Вот если вы не согласитесь с этим последним тезисом и
ответите: «Не так» или «не всегда так», то я, пожалуй, и
ободрюсь духом насчет значения героя моего Алексея
Федоровича. Ибо не только чудак «не всегда» частность и
обособление, а напротив, бывает так, что он-то, пожалуй,
и носит в себе иной раз сердцевину целого, а остальные
люди его эпохи — все, каким-нибудь наплывным  ветром,
на время почему-то от него оторвались…]],
		"Cyrl"
	},
	{ -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् ।
होतारं रत्नधातमम् ॥१॥
अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत ।
स देवाँ एह वक्षति ॥२॥
अग्निना रयिमश्नवत् पोषमेव दिवेदिवे ।
यशसं वीरवत्तमम् ॥३॥
अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि ।
स इद्देवेषु गच्छति ॥४॥
अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः ।
देवो देवेभिरा गमत् ॥५॥
यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि ।
तवेत् तत् सत्यमङ्गिरः ॥६॥
उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् ।
नमो भरन्त एमसि ॥७॥
राजन्तमध्वराणां गोपामृतस्य दीदिविम् ।
वर्धमानं स्वे दमे ॥८॥
स नः पितेव सूनवेऽग्ने सूपायनो भव ।
सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
}

-- Memoizing lookup from a string to a boolean: true when its last character
-- matches the mw.ustring pattern class %p.
local ends_in_punctuation = setmetatable({}, {
	__index = function (cache, text)
		local last_char = mw.ustring.sub(text, -1)
		local is_punctuation = mw.ustring.match(last_char, "%p") ~= nil
		-- Store the answer so that the next lookup bypasses __index.
		cache[text] = is_punctuation
		return is_punctuation
	end,
})
-- Builds the label for a script example: the text with newlines turned into
-- <br>, followed by its per-script code point counts.
local function show_script_example(script_example)
	-- If last character is punctuation, place script counts on their own line
	-- Could use Unicode_data.lookup_category, but that is more memory-intensive.
	local separator = ends_in_punctuation[script_example]
		and "<br>&bull; "
		or ": "

	-- Only the first result of gsub (the new string) is kept.
	local text = script_example:gsub('\n', '<br>')

	return text .. separator .. script_counts[script_example]
end

-- Runs Unicode_data.get_best_script over the shared script examples and
-- compares each result with the example's expected script.
function p:test_get_best_script()
	local function check(tester, text, expected)
		local label = show_script_example(text)
		local actual = Unicode_data.get_best_script(text)
		tester:equals(label, actual, expected)
	end

	self:iterate(script_examples, check)
end

-- Runs Unicode_data.is_Latin over the shared script examples. The expected
-- value is the example's third element when that is truthy; otherwise it is
-- whether the example's expected best script is "Latn".
function p:test_is_Latin()
	local function check(tester, text, best_script, is_Latin)
		local label = show_script_example(text)
		local actual = Unicode_data.is_Latin(text)
		local expected = is_Latin or best_script == "Latn"
		tester:equals(label, actual, expected)
	end

	self:iterate(script_examples, check)
end

-- Compares Unicode_data.lookup_block with the expected block name for a
-- spread of code points.
function p:test_lookup_block()
	local function check(tester, codepoint, block_name)
		local label = show(codepoint)
		local actual = Unicode_data.lookup_block(codepoint)
		tester:equals(label, actual, block_name)
	end

	-- Each case is { code point, expected block name }.
	self:iterate({
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "CJK Unified Ideographs Extension G" },
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}, check)
end

-- Checks Unicode_data.is_rtl on sample strings. Each example is
-- { text, expected result }; the text itself is used as the test label.
-- The string literals below may contain invisible combining and
-- directional-formatting characters, so edit them with care.
function p:test_is_rtl()
	local examples = {
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
		{ "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂‬", true }, -- Syriac, sixth beatitude (Matthew 5:8)
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
		{
			-- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		{ "ދިވެހި", true }, -- the word dhivehi written in Thaana script
		{ "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
		{ "%$!^&", false },
	}
	
	-- The string is passed both as the label and as the input to is_rtl.
	self:iterate(examples,
		function (self, str, expected)
			self:equals(str, Unicode_data.is_rtl(str), expected)
		end)
end

-- Change function names into more readable headers for the testcases tables:
-- every string key of the form "test_<name>" is moved to
-- "testcases for <code><name></code>".
for name, test in m_table.sortedPairs(p) do
	if type(name) == "string" then
		-- The pattern is anchored at both ends, so it substitutes at most
		-- once, and only when the key starts with "test_".
		local header, substitutions = name:gsub("^test_(.+)$", "testcases for <code>%1</code>")
		if substitutions > 0 then
			p[name] = nil
			p[header] = test
		end
	end
end

return p