Module:Text

From The Pinched Universe
Jump to navigation Jump to search

The documentation for this module is not intended to be hosted on this wiki.

However, you might be able to find it at one of the following locations:

-- Library table; the functions below are attached to it as fields.
-- serial is the value returned by p.failsafe().
local Text = { serial = "2017-11-01",
               suite  = "Text" }
--[=[
Text utilities
]=]



-- local globals
-- File-level caches. Each one starts as false and is meant to be filled
-- on first use, so building patterns and tables happens only on demand.
local PatternCJK        = false    -- reserved for the CJK character class
local PatternCombined   = false    -- built by Text.removeDiacritics()
local PatternLatin      = false    -- built by Text.isLatinRange()
local PatternTerminated = false    -- built by Text.sentenceTerminated()
local QuoteLang         = false    -- filled by factoryQuote()
local QuoteType         = false    -- filled by factoryQuote()
local RangesLatin       = false    -- filled by Text.isLatinRange()
local SeekQuote         = false    -- built by Text.isQuote()



local function factoryQuote()
    -- Create quote definitions
    --     QuoteLang maps a lower case language code to the name of a
    --     quotation style; QuoteType maps that name to
    --         { { open, close },     -- level 1, as codepoints
    --           { open, close },     -- level 2, as codepoints
    --           true }               -- optional: pad the text with spaces
    -- Returns: nothing; the function is called for its side effects only
    QuoteLang = { af        = "bd",
                  ar        = "la",
                  be        = "labd",
                  bg        = "bd",
                  ca        = "la",
                  cs        = "bd",
                  da        = "bd",
                  de        = "bd",
                  dsb       = "bd",
                  et        = "bd",
                  el        = "lald",
                  en        = "ld",
                  es        = "la",
                  eu        = "la",
            --    fa        = "la",
                  fi        = "rd",
                  fr        = "laSPC",
                  ga        = "ld",
                  he        = "ldla",
                  hr        = "bd",
                  hsb       = "bd",
                  hu        = "bd",
                  hy        = "labd",
                  id        = "rd",
                  is        = "bd",
                  it        = "ld",
                  ja        = "x300C",
                  ka        = "bd",
                  ko        = "ld",
                  lt        = "bd",
                  lv        = "bd",
                  nl        = "ld",
                  nn        = "la",
                  no        = "la",
                  pl        = "bdla",
                  pt        = "lald",
                  ro        = "bdla",
                  ru        = "labd",
                  sk        = "bd",
                  sl        = "bd",
                  sq        = "la",
                  sr        = "bx",
                  sv        = "rd",
                  th        = "ld",
                  tr        = "ld",
                  uk        = "la",
                  zh        = "ld",
                  ["de-ch"] = "la",
                  ["en-gb"] = "lsld",
                  ["en-us"] = "ld",
                  ["fr-ch"] = "la",
                  ["it-ch"] = "la",
                  ["pt-br"] = "ldla",
                  ["zh-tw"] = "x300C",
                  ["zh-cn"] = "ld" }
    QuoteType = { bd    = { { 8222, 8220 },  { 8218, 8217 } },
                  bdla  = { { 8222, 8220 },  {  171,  187 } },
                  bx    = { { 8222, 8221 },  { 8218, 8217 } },
                  la    = { {  171,  187 },  { 8249, 8250 } },
                  laSPC = { {  171,  187 },  { 8249, 8250 },  true },
                  labd  = { {  171,  187 },  { 8222, 8220 } },
                  lald  = { {  171,  187 },  { 8220, 8221 } },
                  ld    = { { 8220, 8221 },  { 8216, 8217 } },
                  ldla  = { { 8220, 8221 },  {  171,  187 } },
                  lsld  = { { 8216, 8217 },  { 8220, 8221 } },
                  rd    = { { 8221, 8221 },  { 8217, 8217 } },
                  x300C = { { 0x300C, 0x300D },
                            { 0x300E, 0x300F } } }
    -- No return value: the former "return r" read an undeclared global.
end -- factoryQuote()



local function fiatQuote( apply, alien, advance )
    -- Enclose text in the quotation marks registered for a language
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code
    --     advance  -- number, with level 1 or 2
    -- Returns: string; apply is returned unchanged if no marks are found
    if not QuoteLang then
        factoryQuote()
    end
    local style = QuoteLang[ alien ]
    if not style then
        -- Fall back to the part before the first hyphen, then to English.
        local primary = alien:match( "^(%l+)-" )
        style = ( primary  and  QuoteLang[ primary ] )  or  QuoteLang[ "en" ]
    end
    if not style then
        return apply
    end
    local marks = QuoteType[ style ]
    if not marks then
        -- Language is mapped to a style that has no definition.
        mw.log( "fiatQuote() " .. style )
        return apply
    end
    -- A third entry in the style requests a blank inside the marks.
    local gap  = marks[ 3 ]  and  " "  or  ""
    local pair = marks[ advance ]
    if not pair then
        return apply
    end
    local quoted = mw.ustring.format( "%s%s%s%s%s",
                                      mw.ustring.char( pair[ 1 ] ),
                                      gap,
                                      apply,
                                      gap,
                                      mw.ustring.char( pair[ 2 ] ) )
    return quoted
end -- fiatQuote()



Text.char = function ( apply, again, accept )
    -- Create string from codepoints
    -- Parameter:
    --     apply   -- table (sequence) with numerical codepoints, or nil
    --                entries are read in sequence order; reading stops
    --                at the first nil entry
    --     again   -- number of repetitions, or nil
    --     accept  -- true, if no error messages to be appended
    -- Returns: string; empty if apply is not a table or has no entries;
    --          an error <span> if any entry is not a usable codepoint or
    --          again is not a number (unless accept is set)
    local r
    if type( apply ) == "table" then
        local bad   = { }
        local codes = { }
        local s
        -- ipairs keeps the codepoints in their sequence order;
        -- pairs gives no ordering guarantee.
        for _, v in ipairs( apply ) do
            local n = false
            if type( v ) == "number"  and
               ( v >= 32  or  v == 9  or  v == 10 ) then
                -- Control characters other than TAB and LF are refused.
                n = math.floor( v )
                if not ( n <= 0x10FFFF ) then
                    -- Beyond the Unicode range (or NaN); mw.ustring.char
                    -- would raise an error on such a value.
                    n = false
                end
            end
            if n then
                table.insert( codes, n )
            else
                table.insert( bad, tostring( v ) )
            end
        end -- for _, v
        if #bad == 0 then
            if #codes > 0 then
                r = mw.ustring.char( unpack( codes ) )
                if again then
                    if type( again ) == "number" then
                        local n = math.floor( again )
                        if n > 1 then
                            r = r:rep( n )
                        elseif n < 1 then
                            r = ""
                        end
                    else
                        s = "bad repetitions: " .. tostring( again )
                    end
                end
            end
        else
            s = "bad codepoints: " .. table.concat( bad, " " )
        end
        if s  and  not accept then
            r = tostring(  mw.html.create( "span" )
                                  :addClass( "error" )
                                  :wikitext( s ) )
        end
    end
    return r or ""
end -- Text.char()



Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --               anything else is treated as an empty list
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; trimmed non-empty items in ascending key order
    local collect = { }
    local keys    = { }
    args = type( args ) == "table"  and  args  or  { }
    -- pairs() visits keys in unspecified order, and the table may have
    -- gaps (e.g. template parameters 1 and 3 only), so collect the
    -- numeric keys first and walk them sorted.
    for k in pairs( args ) do
        if type( k ) == "number" then
            keys[ #keys + 1 ] = k
        end
    end -- for k
    table.sort( keys )
    for i = 1, #keys do
        local v = mw.text.trim( args[ keys[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end -- for i
    return table.concat( collect,  apply or "|" )
end -- Text.concatParams()



Text.containsCJK = function ( analyse )
    -- Is any CJK code within?
    -- Parameter:
    --     analyse  -- string, or nil (treated as empty)
    -- Returns: true, if CJK detected
    analyse = analyse or ""
    if not PatternCJK then
        -- Character class "[a-b...]" assembled from codepoints;
        -- 91 is "[", 45 is "-", 93 is "]".
        -- Cached in the file-level local rather than a global.
        PatternCJK = mw.ustring.char( 91,
                                        4352, 45,   4607,
                                       11904, 45,  42191,
                                       43072, 45,  43135,
                                       44032, 45,  55215,
                                       63744, 45,  64255,
                                       65072, 45,  65103,
                                       65381, 45,  65500,
                                      131072, 45, 196607,
                                      93 )
    end
    if mw.ustring.find( analyse, PatternCJK ) then
        return true
    end
    return false
end -- Text.containsCJK()

Text.removeDelimited = function (s, prefix, suffix)
	-- Remove all text in s delimited by prefix and suffix (inclusive)
	-- Both delimiters are matched literally, never as Lua patterns.
	-- Text after a prefix that has no matching suffix is removed
	-- up to the end of the string.
	-- Arguments:
	--    s = string to process
	--    prefix = initial delimiter
	--    suffix = ending delimiter
	-- Returns: stripped string
	if prefix == "" then
		-- An empty prefix matches everywhere and would never terminate.
		return s
	end
	-- find() and sub() work on bytes, so the lengths must be bytes too.
	local prefixLen = #prefix
	local suffixLen = #suffix
	local r = s
	local i = r:find(prefix, 1, true)
	local j
	while i do
		-- Plain search: "-->" read as a pattern means "lazy run of '-'
		-- followed by '>'" and would stop at the first '>' inside.
		j = r:find(suffix, i + prefixLen, true)
		if j then
			r = r:sub(1, i - 1) .. r:sub(j + suffixLen)
		else
			r = r:sub(1, i - 1)
		end
		-- Restart from the beginning: joining the two remainders may
		-- have formed a new prefix across the cut.
		i = r:find(prefix, 1, true)
	end
	return r
end

Text.getPlain = function ( adjust )
    -- Strip wikisyntax from a string; templates are left alone
    --     Removed: HTML comments, tags starting with a lower case
    --     letter, bold/italic apostrophes; &nbsp; becomes a blank
    -- Parameter:
    --     adjust  -- string
    -- Returns: string
    local plain = Text.removeDelimited( adjust, "<!--", "-->" )
    -- One substitution per statement; each assignment keeps only the
    -- string and drops the replacement count.
    plain = plain:gsub( "(</?%l[^>]*>)", "" )
    plain = plain:gsub( "'''", "" )
    plain = plain:gsub( "''", "" )
    plain = plain:gsub( "&nbsp;", " " )
    return plain
end -- Text.getPlain()



Text.isLatinRange = function ( adjust )
    -- Does the string consist of latin letters, or of symbols that
    -- occur within latin texts, only?
    -- Precondition:
    --     adjust  -- string, or nil for initialization
    -- Returns: true, if valid for latin only; false, if not;
    --          nil, if called for initialization
    if not RangesLatin then
        RangesLatin = { {    7,  687 },
                        { 7531, 7578 },
                        { 7680, 7935 },
                        { 8194, 8250 } }
    end
    if not PatternLatin then
        -- Anchored character class with one "from-to" per range.
        local parts = { "^[" }
        for _, range in ipairs( RangesLatin ) do
            table.insert( parts,
                          mw.ustring.char( range[ 1 ], 45, range[ 2 ] ) )
        end    -- for _, range
        table.insert( parts, "]*$" )
        PatternLatin = table.concat( parts )
    end
    if not adjust then
        return nil
    end
    if mw.ustring.match( adjust, PatternLatin ) then
        return true
    end
    return false
end -- Text.isLatinRange()



Text.isQuote = function ( ask )
    -- Tell whether a character is some kind of quotation mark
    -- Parameter:
    --     ask  -- string, with single character
    -- Returns: true, if ask is quotation mark
    if not SeekQuote then
        -- String of all known quotation marks; ask is searched in it.
        SeekQuote = mw.ustring.char(   34,       -- "
                                       39,       -- '
                                      171,       -- laquo
                                      187,       -- raquo
                                     8216,       -- lsquo
                                     8217,       -- rsquo
                                     8218,       -- sbquo
                                     8220,       -- ldquo
                                     8221,       -- rdquo
                                     8222,       -- bdquo
                                     8249,       -- lsaquo
                                     8250,       -- rsaquo
                                     0x300C,     -- CJK
                                     0x300D,     -- CJK
                                     0x300E,     -- CJK
                                     0x300F )    -- CJK
    end
    if ask == "" then
        -- The empty string is never a quotation mark.
        return false
    end
    if mw.ustring.find( SeekQuote, ask, 1, true ) then
        return true
    end
    return false
end -- Text.isQuote()



Text.listToText = function ( args, adapt )
    -- Format list items similar to mw.text.listToText()
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --               anything else is treated as an empty list
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; trimmed non-empty items in ascending key order
    local collect = { }
    local keys    = { }
    if type( args ) == "table" then
        -- pairs() visits keys in unspecified order; gather the numeric
        -- keys and sort them so the items keep their position.
        for k in pairs( args ) do
            if type( k ) == "number" then
                keys[ #keys + 1 ] = k
            end
        end -- for k
        table.sort( keys )
    end
    for i = 1, #keys do
        local v = mw.text.trim( args[ keys[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end -- for i
    return mw.text.listToText( collect )
end -- Text.listToText()



Text.quote = function ( apply, alien, advance )
    -- Put text into quotation marks of a language
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: quoted string
    local slang
    if type( alien ) == "string" then
        slang = mw.text.trim( alien ):lower()
    else
        -- TODO FIXME: Introduction expected 2017-04
        -- Page language if that field is available, else the code of
        -- the content language.
        slang = mw.title.getCurrentTitle().pageLanguage
                or  mw.language.getContentLanguage():getCode()
    end
    -- Anything but an explicit 2 means first level marks.
    local mode = ( advance == 2 )  and  2  or  1
    return fiatQuote( mw.text.trim( apply ), slang, mode )
end -- Text.quote()



Text.quoteUnquoted = function ( apply, alien, advance )
    -- Quote text, if not yet quoted and not empty
    --     The text counts as quoted already if its first or its last
    --     character is a quotation mark.
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: string; possibly quoted
    local r = mw.text.trim( apply )
    local s = mw.ustring.sub( r, 1, 1 )
    if s ~= ""  and  not Text.isQuote( s ) then
        -- Last character: both indexes must be -1. The former
        -- sub( r, -1, 1 ) was empty for any text longer than one
        -- character, so a trailing quotation mark was never seen.
        s = mw.ustring.sub( r, -1, -1 )
        if not Text.isQuote( s ) then
            r = Text.quote( r, alien, advance )
        end
    end
    return r
end -- Text.quoteUnquoted()



Text.removeDiacritics = function ( adjust )
    -- Strip all diacritics
    --     The string is decomposed (NFD), combining marks are deleted,
    --     and the remainder is composed again (NFC).
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; all latin letters should be ASCII
    --                  or basic greek or cyrillic or symbols etc.
    if not PatternCombined then
        -- Character class of combining mark ranges;
        -- 91 is "[", 45 is "-", 93 is "]".
        PatternCombined = mw.ustring.char( 91,
                                            0x0300, 45, 0x036F,
                                            0x1AB0, 45, 0x1AFF,
                                            0x1DC0, 45, 0x1DFF,
                                            0xFE20, 45, 0xFE2F,
                                           93 )
    end
    -- Parentheses keep the string only and drop the replacement count.
    local bare = ( mw.ustring.gsub( mw.ustring.toNFD( adjust ),
                                    PatternCombined,
                                    "" ) )
    return mw.ustring.toNFC( bare )
end -- Text.removeDiacritics()



Text.sentenceTerminated = function ( analyse )
    -- Does the string end with dot, question or exclamation mark?
    --     Quotation marks, apostrophes and closing square brackets
    --     may follow the terminating character
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if sentence terminated
    if not PatternTerminated then
        -- First class: terminators, led by four codepoints given
        -- numerically; second class: what may trail up to the end.
        local lead = mw.ustring.char( 91,
                                      12290,
                                      65281,
                                      65294,
                                      65311 )
        PatternTerminated = lead .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
    return mw.ustring.find( analyse, PatternTerminated )
           and  true
           or   false
end -- Text.sentenceTerminated()



Text.ucfirstAll = function ( adjust )
    -- Turn the first letter of every word into upper case
    --     A lower case letter is capitalized whenever it follows
    --     a non-alphanumeric character.
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with all first letters in upper case
    -- Leading blank: the very first letter follows a non-word character.
    local text     = " " .. adjust
    local entities = adjust:find( "&" )
    local pos      = 1
    if entities then
        -- Swap some named entities for numeric ones, so that their
        -- names are not capitalized.
        text = text:gsub( "&amp;",      "&#38;" )
                   :gsub( "&lt;",       "&#60;" )
                   :gsub( "&gt;",       "&#62;" )
                   :gsub( "&nbsp;",    "&#160;" )
                   :gsub( "&thinsp;", "&#8201;" )
                   :gsub( "&zwnj;",   "&#8204;" )
                   :gsub( "&zwj;",    "&#8205;" )
                   :gsub( "&lrm;",    "&#8206;" )
                   :gsub( "&rlm;",    "&#8207;" )
    end
    repeat
        pos = mw.ustring.find( text, "%W%l", pos )
        if pos then
            local at     = pos + 1
            local letter = mw.ustring.sub( text, at, at )
            text = string.format( "%s%s%s",
                                  mw.ustring.sub( text, 1, pos ),
                                  mw.ustring.upper( letter ),
                                  mw.ustring.sub( text, pos + 2 ) )
            pos = at
        end
    until not pos
    -- Drop the leading blank again.
    text = text:sub( 2 )
    if entities then
        -- Back to named entities; "&#X...;" arises from a hexadecimal
        -- "&#x...;" whose "x" has been capitalized above.
        text = text:gsub(     "&#38;", "&amp;" )
                   :gsub(     "&#60;", "&lt;" )
                   :gsub(     "&#62;", "&gt;" )
                   :gsub(    "&#160;", "&nbsp;" )
                   :gsub(   "&#8201;", "&thinsp;" )
                   :gsub(   "&#8204;", "&zwnj;" )
                   :gsub(   "&#8205;", "&zwj;" )
                   :gsub(   "&#8206;", "&lrm;" )
                   :gsub(   "&#8207;", "&rlm;" )
                   :gsub( "&#X(%x+);", "&#x%1;" )
    end
    return text
end -- Text.ucfirstAll()



Text.uprightNonlatin = function ( adjust )
    -- Ensure non-italics for non-latin text parts
    --     One single greek letter might be granted
    --     The string is scanned character by character; each run that
    --     starts at a non-latin character is wrapped into a <span>
    --     with font-style:normal.
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with non-latin parts enclosed in <span>
    local r
    -- Called without argument: only initializes RangesLatin and
    -- PatternLatin.
    Text.isLatinRange()
    if mw.ustring.match( adjust, PatternLatin ) then
        -- latin only, horizontal dashes, quotes
        r = adjust
    else
        local c                -- codepoint; reused as string in "for ix"
        local j    = false     -- start index of current non-latin run
        local k    = 1         -- start index of text not yet put into r
        local m    = false     -- index bookkeeping for single greek letter
        local n    = mw.ustring.len( adjust )
        local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
        local flat = function ( a )
                  -- isLatin
                  -- true, if codepoint a is within any of RangesLatin;
                  -- no value (nil) otherwise
                  local range
                  for i = 1, #RangesLatin do
                      range = RangesLatin[ i ]
                      if a >= range[ 1 ]  and  a <= range[ 2 ] then
                          return true
                      end
                  end    -- for i
              end -- flat()
        local focus = function ( a )
                  -- char is not ambivalent
                  -- Above 64: everything but 8192...8212 is decisive;
                  -- up to 64: only '&' and '<' are decisive.
                  -- Note: this r is local to focus(), not the result.
                  local r = ( a > 64 )
                  if r then
                      r = ( a < 8192  or  a > 8212 )
                  else
                      r = ( a == 38  or  a == 60 )    -- '&' '<'
                  end
                  return r
              end -- focus()
        local form = function ( a )
                -- Result so far, then the pending text k...j-1 as it is,
                -- then the run j...a wrapped into the <span>.
                return string.format( span,
                                      r,
                                      mw.ustring.sub( adjust, k, j - 1 ),
                                      mw.ustring.sub( adjust, j, a ) )
              end -- form()
        r = ""
        for i = 1, n do
            c = mw.ustring.codepoint( adjust, i, i )
            if focus( c ) then
                if flat( c ) then
                    -- decisive latin character: closes a pending run
                    if j then
                        if m then
                            if i == m then
                                -- single greek letter.
                                j = false
                            end
                            m = false
                        end
                        if j then
                            -- Walk back over blanks and "(" right before
                            -- this character; they go behind the <span>
                            -- rather than into it.
                            local nx = i - 1
                            local s  = ""
                            for ix = nx, 1, -1 do
                                c = mw.ustring.sub( adjust, ix, ix )
                                if c == " "  or  c == "(" then
                                    nx = nx - 1
                                    s  = c .. s
                                else
                                    break -- for ix
                                end
                            end -- for ix
                            r = form( nx ) .. s
                            j = false
                            k = i
                        end
                    end
                elseif not j then
                    -- decisive non-latin character: opens a run
                    j = i
                    if c >= 880  and  c <= 1023 then
                        -- single greek letter?
                        m = i + 1
                    else
                        m = false
                    end
                end
            elseif m then
                -- ambivalent character after a greek candidate:
                -- shift the index expected for the next latin character
                m = m + 1
            end
        end    -- for i
        if j  and  ( not m  or  m < n ) then
            -- a run is still open at the end of the string
            r = form( n )
        else
            -- append the remaining text as it is
            r = r .. mw.ustring.sub( adjust, k )
        end
    end
    return r
end -- Text.uprightNonlatin()



Text.test = function ( about )
    -- Give access to internal data for testing
    -- Parameter:
    --     about  -- string; "quote" is the only topic supported
    -- Returns: table with QuoteLang and QuoteType for "quote",
    --          nil for any other topic
    if about ~= "quote" then
        return nil
    end
    factoryQuote()
    return { QuoteLang = QuoteLang,
             QuoteType = QuoteType }
end -- Text.test()



-- Export
local p = { }

function p.char( frame )
    -- Template/#invoke entry for Text.char()
    --     1       -- codepoints separated by whitespace; decimal, or
    --                hexadecimal with leading "x"
    --     *       -- number of repetitions (optional)
    --     errors  -- "0" to suppress error messages (optional)
    -- Parameters of the parent (template) frame take precedence, if
    -- parameter 1 is present there.
    local parent = frame:getParent()
    -- There is no parent frame when not invoked from a template.
    local params = parent  and  parent.args  or  frame.args
    local story  = params[ 1 ]
    local codes, lenient, multiple
    if not story then
        params = frame.args
        story  = params[ 1 ]
    end
    if story then
        local items = mw.text.split( story, "%s+" )
        if #items > 0 then
            lenient  = ( params.errors == "0" )
            codes    = { }
            multiple = tonumber( params[ "*" ] )
            -- ipairs keeps the items in the order they were written;
            -- pairs gives no ordering guarantee.
            for _, v in ipairs( items ) do
                if v ~= "" then
                    local j
                    if v:sub( 1, 1 ) == "x" then
                        -- "x41" is read as hexadecimal "0x41"
                        j = tonumber( "0" .. v )
                    else
                        j = tonumber( v )
                    end
                    -- Unparsable items are passed on as strings, so
                    -- that Text.char() reports them.
                    table.insert( codes,  j or v )
                end
            end -- for _, v
        end
    end
    return Text.char( codes, multiple, lenient )
end

function p.concatParams( frame )
    -- #invoke entry for Text.concatParams()
    --     template=1 takes the list from the parent (template) frame,
    --     otherwise from the #invoke itself; separator and format are
    --     always read from the #invoke.
    local own      = frame.args
    local template = own.template
    if type( template ) == "string" then
        template = ( mw.text.trim( template ) == "1" )
    end
    local source = template  and  frame:getParent().args  or  own
    return Text.concatParams( source, own.separator, own.format )
end

function p.containsCJK( frame )
    -- #invoke entry for Text.containsCJK(); "1" if detected, else ""
    local text = frame.args[ 1 ] or ""
    if Text.containsCJK( text ) then
        return "1"
    end
    return ""
end

function p.getPlain( frame )
    -- #invoke entry for Text.getPlain(); missing parameter 1 counts as ""
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.getPlain( text )
end

function p.isLatinRange( frame )
    -- #invoke entry for Text.isLatinRange(); "1" if latin only, else ""
    local text = frame.args[ 1 ] or ""
    if Text.isLatinRange( text ) then
        return "1"
    end
    return ""
end

function p.isQuote( frame )
    -- #invoke entry for Text.isQuote(); "1" if quotation mark, else ""
    local text = frame.args[ 1 ] or ""
    if Text.isQuote( text ) then
        return "1"
    end
    return ""
end



function p.listToFormat(frame)
    -- Merge several separated lists by means of a format string
    --     1, 2, ...  -- lists; list n supplies the n-th "%s" of format
    --     format     -- string with one "%s" per list (default: empty)
    --     sep        -- separator of list items (default: ";");
    --                   handed to mw.text.split() as a pattern
    -- Returns: string; the filled format once per item position,
    --          without any separator in between. Items missing in a
    --          shorter list are treated as empty.
    local lists   = {}
    local pformat = frame.args["format"] or ""
    local sep     = frame.args["sep"] or ";"

    -- Parse parameters: lists
    for k, v in pairs(frame.args) do
        local knum = tonumber(k)
        if knum then lists[knum] = v end
    end

    -- Count the lists numbered 1, 2, ... without gap; the length
    -- operator is not reliable on a table that may have holes.
    local count = 0
    while lists[count + 1] ~= nil do
        count = count + 1
    end

    -- Split lists
    local maxListLen = 0
    for i = 1, count do
        lists[i] = mw.text.split(lists[i], sep)
        if #lists[i] > maxListLen then maxListLen = #lists[i] end
    end

    -- Generate result string
    local lines = {}
    for i = 1, maxListLen do
        local line = pformat
        for j = 1, count do
            local item = lists[j][i] or ""
            -- A function as replacement inserts the item literally;
            -- a replacement string would have "%" interpreted.
            line = mw.ustring.gsub(line, "%%s", function () return item end, 1)
        end
        lines[i] = line
    end

    return table.concat(lines)
end



function p.listToText( frame )
    -- #invoke entry for Text.listToText()
    --     template=1 takes the list from the parent (template) frame,
    --     otherwise from the #invoke itself; format is always read
    --     from the #invoke.
    local own      = frame.args
    local template = own.template
    if type( template ) == "string" then
        template = ( mw.text.trim( template ) == "1" )
    end
    local source = template  and  frame:getParent().args  or  own
    return Text.listToText( source, own.format )
end



function p.quote( frame )
    -- #invoke entry for Text.quote()
    --     1  -- text
    --     2  -- language code (optional; blank counts as not given)
    --     3  -- level 1 or 2 (optional)
    local args  = frame.args
    local slang = args[2]
    if type( slang ) == "string" then
        local code = mw.text.trim( slang )
        -- false rather than "", so that Text.quote() picks its default
        slang = ( code ~= "" )  and  code
    end
    return Text.quote( args[ 1 ] or "", slang, tonumber( args[3] ) )
end



function p.quoteUnquoted( frame )
    -- #invoke entry for Text.quoteUnquoted()
    --     1  -- text
    --     2  -- language code (optional; blank counts as not given)
    --     3  -- level 1 or 2 (optional)
    local args  = frame.args
    local slang = args[2]
    if type( slang ) == "string" then
        local code = mw.text.trim( slang )
        -- false rather than "", so that the default language is used
        slang = ( code ~= "" )  and  code
    end
    return Text.quoteUnquoted( args[ 1 ] or "", slang, tonumber( args[3] ) )
end



function p.removeDiacritics( frame )
    -- #invoke entry for Text.removeDiacritics(); missing parameter 1
    -- counts as ""
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.removeDiacritics( text )
end

function p.sentenceTerminated( frame )
    -- #invoke entry for Text.sentenceTerminated(); "1" if terminated,
    -- else ""
    local text = frame.args[ 1 ] or ""
    if Text.sentenceTerminated( text ) then
        return "1"
    end
    return ""
end

function p.ucfirstAll( frame )
    -- #invoke entry for Text.ucfirstAll(); missing parameter 1 counts as ""
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.ucfirstAll( text )
end

function p.uprightNonlatin( frame )
    -- #invoke entry for Text.uprightNonlatin(); missing parameter 1
    -- counts as ""
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.uprightNonlatin( text )
end



function p.zip(frame)
    -- Interleave the items of several separated lists
    --     1, 2, ...       -- lists
    --     sep             -- default separator of list items
    --     sep1, sep2, ... -- separator for one particular list
    --     isep            -- put between the items of one position
    --     osep            -- put between positions
    -- Returns: string; items missing in a shorter list count as empty
    local args       = frame.args
    local lists      = {}
    local seps       = {}
    local defaultsep = args["sep"] or ""
    local innersep   = args["isep"] or ""
    local outersep   = args["osep"] or ""

    -- Parse parameters: numbered lists and "sepN" separators
    for k, v in pairs(args) do
        local knum = tonumber(k)
        if knum then
            lists[knum] = v
        elseif string.sub(k, 1, 3) == "sep" then
            local sepnum = tonumber(string.sub(k, 4))
            if sepnum then
                seps[sepnum] = v
            end
        end
    end

    -- Use the default separator wherever no explicit one is given
    for i = 1, math.max(#seps, #lists) do
        seps[i] = seps[i] or defaultsep
    end

    -- Split lists and find the longest one
    local maxListLen = 0
    for i = 1, #lists do
        local items = mw.text.split(lists[i], seps[i])
        lists[i] = items
        maxListLen = math.max(maxListLen, #items)
    end

    -- Collect the pieces in order and join them once at the end
    local pieces = {}
    for i = 1, maxListLen do
        if i > 1 then
            pieces[#pieces + 1] = outersep
        end
        for j = 1, #lists do
            if j > 1 then
                pieces[#pieces + 1] = innersep
            end
            pieces[#pieces + 1] = lists[j][i] or ""
        end
    end
    return table.concat(pieces)
end



function p.failsafe()
    -- Returns: the version identifier kept in Text.serial
    local serial = Text.serial
    return serial
end



p.Text = function ()
    -- Access for other modules: returns the library table itself
    local library = Text
    return library
end -- p.Text

return p