Module:Unicode data/testcases
Jump to navigation
Jump to search
The documentation for this module is not intended to be hosted on this wiki.
However, you might be able to find it at one of the following locations:
local p = require "Module:UnitTests"
local Unicode_data = require(mw.title.getCurrentTitle().subpageText == "sandbox"
and "Module:Unicode data/sandbox" or "Module:Unicode data")
local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint
-- Format a code point for use as a test-row label.
-- Non-printable code points are shown as "U+XXXX" only. Printable ones are
-- followed by the character itself; a character that is changed by NFC
-- normalization is written as a hexadecimal character reference instead, and
-- a combining character is placed on a dotted circle.
local function show(codepoint)
    if not Unicode_data.is_printable(codepoint) then
        return string.format("U+%04X", codepoint)
    end

    local glyph = U(codepoint)
    if mw.ustring.toNFC(glyph) ~= glyph then
        -- The literal character would not survive normalization.
        glyph = string.format("&#x%X;", codepoint)
    end
    if Unicode_data.is_combining(codepoint) then
        glyph = "◌" .. glyph
    end
    return string.format("U+%04X: %s", codepoint, glyph)
end
-- Label for a test row: the output of show() followed by the code point's
-- name (as returned by Unicode_data.lookup_name) in parentheses.
local function show_codepoint_and_name(codepoint)
    local label = show(codepoint)
    local name = Unicode_data.lookup_name(codepoint)
    return string.format("%s (%s)", label, name)
end
-- Checks Unicode_data.lookup_name against a list of
-- { code point, expected name } pairs.
function p:test_lookup_name()
    local cases = {
        { 0x0000, "<control-0000>" },
        { 0x007F, "<control-007F>" },
        { 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
        { 0x0300, "COMBINING GRAVE ACCENT" },
        { 0x0378, "<reserved-0378>" },
        { 0x1B44, "BALINESE ADEG ADEG" },
        { 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
        { 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
        { 0xAC01, "HANGUL SYLLABLE GAG" },
        { 0xD5FF, "HANGUL SYLLABLE HEH" },
        { 0xDC00, "<surrogate-DC00>" },
        { 0xEEEE, "<private-use-EEEE>" },
        { 0xFDD1, "<noncharacter-FDD1>" },
        { 0xFFFD, "REPLACEMENT CHARACTER" },
        { 0xFFFF, "<noncharacter-FFFF>" },
        { 0x1F4A9, "PILE OF POO" },
        { 0xE0000, "<reserved-E0000>" },
        { 0xF0F0F, "<private-use-F0F0F>" },
        { 0x10FFFF, "<noncharacter-10FFFF>" },
    }

    -- Row label: the code point; actual: looked-up name; expected: listed name.
    local function check_name(self, codepoint, expected_name)
        local actual_name = Unicode_data.lookup_name(codepoint)
        self:equals(show(codepoint), actual_name, expected_name)
    end

    self:iterate(cases, check_name)
end
-- Checks Unicode_data.lookup_age against a list of
-- { code point, expected age string } pairs.
--
-- lookup_age may not be present in [[Module:Unicode data]] yet. When it is
-- absent every example is skipped without producing a row. Unlike a blanket
-- pcall, this only tolerates the function being missing: once it exists, any
-- error raised by it (or by show or self:equals) is reported instead of being
-- silently discarded.
function p:test_lookup_age()
    local examples = {
        { 0x0061, "1.1" },
        { 0x0378, "NA" },
        { 0x1B44, "5.0" },
        { 0x3555, "3.0" },
        { 0xAC01, "2.0" },
        { 0xDC00, "2.0" },
        { 0xEEEE, "1.1" },
        { 0xFDD1, "3.1" },
        { 0x1F4A9, "6.0" },
        { 0xE0000, "NA" },
        { 0xF0F0F, "2.0" },
        { 0x10FFFF, "2.0" },
    }

    -- Remove this existence check when the function is added to
    -- [[Module:Unicode data]].
    local lookup_age = Unicode_data.lookup_age

    self:iterate(examples,
        function (self, codepoint, age)
            if lookup_age ~= nil then
                self:equals(show(codepoint), lookup_age(codepoint), age)
            end
        end)
end
-- Checks Unicode_data.is_combining against
-- { code point, expected boolean } pairs.
function p:test_is_combining()
    local cases = {
        { 0x0300, true },
        { 0x0060, false },
    }

    local function check_combining(self, codepoint, expected)
        local label = show_codepoint_and_name(codepoint)
        self:equals(label, Unicode_data.is_combining(codepoint), expected)
    end

    self:iterate(cases, check_combining)
end
-- Checks Unicode_data.is_default_ignorable against
-- { code point, expected boolean } pairs.
--
-- is_default_ignorable may not be present in [[Module:Unicode data]] yet.
-- When it is absent every example is skipped without producing a row. Unlike
-- a blanket pcall, this only tolerates the function being missing: once it
-- exists, any error raised while running an example is reported instead of
-- being silently discarded.
function p:test_is_default_ignorable()
    local examples = {
        { 0x0061, false },
        { 0x00AD, true },
    }

    -- Remove this existence check when the function is added to
    -- [[Module:Unicode data]].
    local is_default_ignorable = Unicode_data.is_default_ignorable

    self:iterate(examples,
        function (self, codepoint, expected)
            if is_default_ignorable ~= nil then
                self:equals(
                    show_codepoint_and_name(codepoint),
                    is_default_ignorable(codepoint),
                    expected)
            end
        end)
end
-- Checks Unicode_data.lookup_script against
-- { code point, expected script code } pairs.
function p:test_lookup_script()
    local cases = {
        { 0x0061, "Latn" },
        { 0x002F, "Zyyy" },
        { 0x0300, "Zinh" },
        { 0x0378, "Zzzz" },
        { 0x0398, "Grek" },
        { 0x03E2, "Copt" },
        { 0x2014, "Zyyy" },
    }

    local function check_script(self, codepoint, expected)
        local label = show_codepoint_and_name(codepoint)
        self:equals(label, Unicode_data.lookup_script(codepoint), expected)
    end

    self:iterate(cases, check_script)
end
-- Checks Unicode_data.lookup_category against
-- { code point, expected General Category code } pairs. Code points are given
-- either numerically or by taking the code point of a literal character.
function p:test_lookup_category()
    local cases = {
        { get_codepoint "\t", "Cc" },
        { get_codepoint " ", "Zs" },
        { get_codepoint "[", "Ps" },
        { get_codepoint "]", "Pe" },
        { get_codepoint "^", "Sk" },
        { get_codepoint "A", "Lu" },
        { 0x00AD, "Cf" },
        { get_codepoint "¾", "No" },
        { get_codepoint "«", "Pi" },
        { get_codepoint "»", "Pf" },
        { 0x0300, "Mn" },
        { 0x0488, "Me" },
        { get_codepoint "٣", "Nd" },
        { get_codepoint "子", "Lo" },
        { get_codepoint "ᾮ", "Lt" },
        { 0x1B44, "Mc" },
        { get_codepoint "∈", "Sm" },
        { get_codepoint "‿", "Pc" },
        { get_codepoint "↹", "So" },
        { get_codepoint "⸗", "Pd" },
        { get_codepoint "Ⅷ", "Nl" },
        { 0x2028, "Zl" },
        { 0x2029, "Zp" },
        { get_codepoint "ゞ", "Lm" },
        { 0xD800, "Cs" },
        { get_codepoint "£", "Sc" },
        { 0xFFFF, "Cn" },
        { 0x100000, "Co" },
    }

    local function check_category(self, codepoint, expected)
        local label = show_codepoint_and_name(codepoint)
        self:equals(label, Unicode_data.lookup_category(codepoint), expected)
    end

    self:iterate(cases, check_category)
end
local fun = require "Module:Fun"
local m_table = require "Module:TableTools"
-- Metatable for per-string script tallies. It is also its own metatable, so
-- calling script_to_count_mt() produces a fresh tally table.
local script_to_count_mt = {}

-- Reading a script that has not been counted yet stores and returns 0, so
-- callers can increment without initializing.
function script_to_count_mt.__index(tally, script)
    tally[script] = 0
    return 0
end

-- Calling the metatable returns a new empty table that uses it.
function script_to_count_mt.__call(mt)
    return setmetatable({}, mt)
end

setmetatable(script_to_count_mt, script_to_count_mt)
-- Memoizing table: script_counts[str] is a printable summary of how many
-- code points of str belong to each script, formatted as "Script (count)"
-- items joined by ", " and ordered by descending count. Non-string keys
-- yield nil and are not cached.
local script_counts = setmetatable({}, {
    __index = function (cache, str)
        if type(str) ~= "string" then
            return nil
        end

        -- Tally the script of every code point in the string.
        local tally = script_to_count_mt()
        for cp in mw.ustring.gcodepoint(str) do
            local script = Unicode_data.lookup_script(cp)
            tally[script] = tally[script] + 1
        end

        local function by_descending_count(script1, script2)
            return tally[script1] > tally[script2]
        end

        local function format_entry(count, script)
            return ("%s (%d)"):format(script, count)
        end

        local summary = table.concat(
            fun.mapIter(
                format_entry,
                m_table.sortedPairs(tally, by_descending_count)),
            ", ")

        cache[str] = summary
        return summary
    end,
})
-- Shared examples for test_get_best_script and test_is_Latin.
-- Each table entry is { text, expected best script (or nil) }; test_is_Latin
-- also reads an optional third field that, when truthy, makes the expected
-- is_Latin result true regardless of the best script.
-- Bare string entries are presumably rendered as section headers by
-- self:iterate -- confirm against Module:UnitTests.
local script_examples = {
-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
-- characters as Latn.
-- This particular example only has characters below U+0340, so
-- lookup_script doesn't have to be called.
{ "%!?́", nil },
{ "’ʼ“”†‡•‰′‽⁕", nil },
{ "col·legi", "Latn" },
"HTML character references",
-- NOTE(review): the two entries in each pair below are byte-identical here.
-- Presumably they were meant to be different character-reference spellings of
-- the same character (e.g. hexadecimal vs. decimal or named) -- confirm
-- against the upstream module.
{ "𐘀", "Lina" },
{ "𐘀", "Lina" },
{ "–", nil },
{ "–", nil },
-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
"Halkomelem",
{ "lá:yelhp", "Latn" },
{ "xʷməθkʷəy̓əm", nil }, -- one Greek (Grek) character
{ "hən̓q̓əmin̓əm̓", "Latn" },
"Quotes",
-- [[s:it:Divina Commedia/Inferno/Canto I]]
{
[[Tant’è amara che poco è più morte;
ma per trattar del ben ch’i’ vi trovai,
dirò de l’altre cose ch’i’ v’ ho scorte.]],
"Latn"
},
{ -- A blessing in Navajo:
-- [[User talk:Stephen G. Brown/text8]]
[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł.
Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní
bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo
nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
"Latn"
},
{ -- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
-- breves added to mark the length of the monophthongs α, ι, υ:
[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος
οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε,
πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν
ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν
οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
"Grek"
},
{ -- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
[[Вот если вы не согласитесь с этим последним тезисом и
ответите: «Не так» или «не всегда так», то я, пожалуй, и
ободрюсь духом насчет значения героя моего Алексея
Федоровича. Ибо не только чудак «не всегда» частность и
обособление, а напротив, бывает так, что он-то, пожалуй,
и носит в себе иной раз сердцевину целого, а остальные
люди его эпохи — все, каким-нибудь наплывным ветром,
на время почему-то от него оторвались…]],
"Cyrl"
},
{ -- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् ।
होतारं रत्नधातमम् ॥१॥
अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत ।
स देवाँ एह वक्षति ॥२॥
अग्निना रयिमश्नवत् पोषमेव दिवेदिवे ।
यशसं वीरवत्तमम् ॥३॥
अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि ।
स इद्देवेषु गच्छति ॥४॥
अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः ।
देवो देवेभिरा गमत् ॥५॥
यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि ।
तवेत् तत् सत्यमङ्गिरः ॥६॥
उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् ।
नमो भरन्त एमसि ॥७॥
राजन्तमध्वराणां गोपामृतस्य दीदिविम् ।
वर्धमानं स्वे दमे ॥८॥
स नः पितेव सूनवेऽग्ने सूपायनो भव ।
सचस्वा नः स्वस्तये ॥९॥]],
"Deva"
},
}
-- Memoizing table: ends_in_punctuation[str] is true when the last character
-- of str matches the pattern "%p" under mw.ustring, false otherwise.
local ends_in_punctuation = setmetatable({}, {
    __index = function (cache, text)
        local last_char = mw.ustring.sub(text, -1)
        local is_punctuation = mw.ustring.match(last_char, "%p") ~= nil
        cache[text] = is_punctuation
        return is_punctuation
    end,
})
-- Row label for a script example: the text (newlines turned into <br>)
-- followed by its per-script code point counts.
local function show_script_example(script_example)
    -- If the last character is punctuation, place the script counts on their
    -- own line. Unicode_data.lookup_category could be used for this check,
    -- but that is more memory-intensive.
    local separator
    if ends_in_punctuation[script_example] then
        separator = "<br>• "
    else
        separator = ": "
    end

    local html = script_example:gsub('\n', '<br>')
    return html .. separator .. script_counts[script_example]
end
-- Checks Unicode_data.get_best_script against the expected script listed for
-- each entry of script_examples.
function p:test_get_best_script()
    local function check_best_script(self, str, expected)
        local label = show_script_example(str)
        self:equals(label, Unicode_data.get_best_script(str), expected)
    end

    self:iterate(script_examples, check_best_script)
end
-- Checks Unicode_data.is_Latin against script_examples. The expected value is
-- the entry's optional third field when that is truthy; otherwise it is
-- whether the entry's best script is "Latn".
function p:test_is_Latin()
    local function check_is_Latin(self, str, best_script, is_Latin)
        local expected = is_Latin or best_script == "Latn"
        self:equals(show_script_example(str), Unicode_data.is_Latin(str),
            expected)
    end

    self:iterate(script_examples, check_is_Latin)
end
-- Checks Unicode_data.lookup_block against
-- { code point, expected block name } pairs.
function p:test_lookup_block()
    local cases = {
        { 0x0064, "Basic Latin" },
        { 0x030B, "Combining Diacritical Marks" },
        { 0x03A3, "Greek and Coptic" },
        { 0x0411, "Cyrillic" },
        { 0x10E6, "Georgian" },
        { 0x3175, "Hangul Compatibility Jamo" },
        { 0xAC01, "Hangul Syllables" },
        { 0x4E0A, "CJK Unified Ideographs" },
        { 0x1F608, "Emoticons" },
        { 0x30000, "CJK Unified Ideographs Extension G" },
        { 0x10FFFF, "Supplementary Private Use Area-B" },
    }

    local function check_block(self, codepoint, block_name)
        local actual_block = Unicode_data.lookup_block(codepoint)
        self:equals(show(codepoint), actual_block, block_name)
    end

    self:iterate(cases, check_block)
end
-- Checks Unicode_data.is_rtl against { text, expected boolean } pairs; the
-- text itself is used as the row label.
function p:test_is_rtl()
    local cases = {
        { "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
        { "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
        { "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂", true }, -- Syriac, sixth beatitude (Matthew 5:8)
        { "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
        { "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
        {
            -- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
            "𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
            true
        },
        { "ދިވެހި", true }, -- the word dhivehi written in Thaana script
        { "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
        { "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
        { "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
        { "%$!^&", false },
    }

    local function check_direction(self, text, expected)
        self:equals(text, Unicode_data.is_rtl(text), expected)
    end

    self:iterate(cases, check_direction)
end
-- Give the testcases tables more readable headers: every string key of the
-- form "test_<name>" is replaced by "testcases for <code><name></code>".
-- Keys are removed from and added to p inside the loop; sortedPairs
-- presumably takes a snapshot of the keys up front, which would make that
-- safe -- confirm against Module:TableTools.
for key, test in m_table.sortedPairs(p) do
    local name = type(key) == "string" and key:match("^test_(.+)$")
    if name then
        p[key] = nil
        p["testcases for <code>" .. name .. "</code>"] = test
    end
end
return p