TGC Codebase Backup



Unicode UTF by Scary Little Rabbit

14th Sep 2016 16:04
Summary

Analogues of the len(), mid(), left(), right(), asc() and chr() functions that work with UTF-8 strings.



Description



Code
                                    ` This code was downloaded from The Game Creators
                                    ` It is reproduced here with full permission
                                    ` http://www.thegamecreators.com
                                    
                                    
` project: UTF8
` created: 2016-09-13
` license: MIT
` abstract: analogues for len(), mid(), left(), right(), asc(), chr() functions to work with UTF-8 strings

` copyright (c) 2016, Simon Grim

` tip me via bitcoin:1C5NZCMkjJTf8v7t41QwW9EeCjJcLEbf5s or https://paypal.me/harder


` to use strLen(), strMid(), strLeft(), strRight(), getUTF8(), chrUTF8() instead of the built-in AGK commands,
` place the following two lines in your initialization code (uncomment them, of course)
` #include 'UTF8.agc'
`     initUTF8()


` One entry of the _UTF8 character cache, created by newUTF8().
` NOTE(review): _UTF8 is filled with .insertSorted() and searched with .find(ID);
` presumably both key on the first field of the type, so ID should stay first - confirm against the AGK array docs.
type UTF8Type
    ID      as integer  ` integer ID the char was created from (the argument of newUTF8()/chrUTF8())
    char    as string   ` the character itself, as built by newUTF8() from the bytes of ID
    ` bytes   as integer[]  ` uncomment if you need to use it somewhere
endtype


` ID returned by getUTF8() when the first bytes of a string do not form a char it accepts
#constant _chrUnknownID 63  ` ID of '?' char

` 4-byte scratch memblock (created in initUTF8()) used by int2()/int3()/int4() and newUTF8()
` to convert between an integer ID and its individual bytes
global _intMemblock             as integer
` number of chars found by the last strUni() scan (set to the byte count when strUni() returns 0)
global _strLength               as integer
` memblock holding the bytes of _str; deleted and re-created by strUni() for every new string
global _strMemblock             as integer
` the string most recently scanned by strUni(); used to skip re-scanning the same string
global _str                     as string
` element [n] is the byte offset just past the n-th char of _str (element [0] is never written);
` strUni() sets element [1] to 0 to mark a string it reported as non-Unicode
global _strMemblockCharOffsets  as integer[0]
` cache of already built chars, kept ordered via insertSorted() in newUTF8()
global _UTF8                    as UTF8Type[0]


` Creates the two scratch memblocks and pre-fills the _UTF8 cache with frequently used chars.
` Must be called once before any other function of this file is used.
function initUTF8()
    ` scratch memblocks: one for ID <-> bytes conversion, one for the string being scanned
    _intMemblock = createMemblock(4)
    _strMemblock = createMemblock(4)

    ` warm up the _UTF8 cache (runtime performance optimization)

    ` tab
    newUTF8(9)

    ` latin and some basic punctuation (single-byte chars 32..126)
    for asciiCode = 32 to 126
        newUTF8(asciiCode)
    next

    ` some latin with diacritics and additional punctuation (Unicode block LATIN_1_SUPPLEMENT):
    ` lead byte 194 with trail bytes 160..191, then lead byte 195 with trail bytes 128..191
    for leadByte = 194 to 195
        trailFrom = 128
        if leadByte = 194 then trailFrom = 160
        for trailByte = trailFrom to 191
            newUTF8(int2(leadByte, trailByte))
        next
    next

    ` some cyrillic (Unicode block CYRILLIC):
    ` lead byte 208 with trail bytes 128..191, then lead byte 209 with trail bytes 128..159
    for leadByte = 208 to 209
        trailTo = 191
        if leadByte = 209 then trailTo = 159
        for trailByte = 128 to trailTo
            newUTF8(int2(leadByte, trailByte))
        next
    next
endfunction

` Returns the length of s: 0 for an empty string, otherwise the value
` strUni() leaves in _strLength after scanning s.
function strLen(s as string)
    if s <> ''
        ` the scan (or its cached result) leaves the length in _strLength
        strUni(s)
        exitfunction _strLength
    endif
endfunction 0

` Scans s and records the byte offset after every char in _strMemblockCharOffsets.
` Side effects: updates _str, _strLength, _strMemblock and _strMemblockCharOffsets.
` Returns 1 when s was scanned successfully and contains at least one multi-byte char.
` Returns 0 when s consists of single-byte chars only or the scan hit a byte it does not accept
` (in both cases _strLength ends up equal to len(s) and callers fall back to the built-in commands).
` Only lead bytes are inspected; trail bytes are not validated.
function strUni(s as string)
    ` same string as on the previous call: reuse the stored verdict
    if s = _str then exitfunction _strMemblockCharOffsets[1] > 0

    byteCount = len(s)

    ` one slot per byte is more than needed, but it is always enough
    if _strMemblockCharOffsets.length < byteCount then _strMemblockCharOffsets.length = byteCount

    _strLength = 0
    _str = s

    ` replace the string memblock with one holding the bytes of s
    deleteMemblock(_strMemblock)
    createMemblock(_strMemblock, byteCount + 1)
    setMemblockString(_strMemblock, 0, s)

    bytePos = 0
    while bytePos < byteCount
        leadByte = getMemblockByte(_strMemblock, bytePos)

        ` number of bytes announced by the lead byte; 0 = not an accepted lead byte
        seqBytes = 0
        if leadByte < 128
            seqBytes = 1
        elseif leadByte > 191
            if leadByte < 224
                seqBytes = 2
            elseif leadByte < 240
                seqBytes = 3
            elseif leadByte < 248
                seqBytes = 4
            endif
        endif

        inc bytePos, seqBytes
        if seqBytes = 0 or bytePos > byteCount
            ` bad lead byte or sequence running past the end: flag the string as non-Unicode
            _strLength = byteCount
            exit
        endif

        ` remember where this char ends
        inc _strLength
        _strMemblockCharOffsets[_strLength] = bytePos
    endwhile

    ` as many chars as bytes: either flagged above or purely single-byte
    if _strLength = byteCount
        _strMemblockCharOffsets[1] = 0
        exitfunction 0
    endif
endfunction 1

` UTF-8 aware analogue of mid().
` s - source string
` p - 1-based position of the first char to return
` l - number of chars to return; a negative value means "up to the end of the string"
` Returns '' when s is empty, p < 1, l = 0 or p lies beyond the last char.
` When strUni() reports 0 for s, the built-in mid() is used instead.
function strMid(s as string, p as integer, l as integer)
    res as string

    if s = '' or p < 1 or l = 0 then exitfunction ''

    if strUni(s) = 0 then exitfunction mid(s, p, l)

    ` start position beyond the last char: without this guard the offset table would be read
    ` past the scanned part (or past its end) and a negative byte length could be requested
    if p > _strLength then exitfunction ''

    ` p becomes the number of chars to skip, l the index of the last char to return
    dec p
    if l > 0
        l = p + l
        if l > _strLength then l = _strLength
    else
        l = _strLength
    endif

    ` translate char indices into byte offsets (slot 0 is never written and stays 0)
    p = _strMemblockCharOffsets[p]
    res = getMemblockString(_strMemblock, p, _strMemblockCharOffsets[l] - p)
endfunction res

` UTF-8 aware analogue of left(): returns the first l chars of s.
` Returns '' when s is empty or l < 1.
` When strUni() reports 0 for s, the built-in left() is used instead.
function strLeft(s as string, l as integer)
    headStr as string

    if l >= 1 and s <> ''
        if strUni(s) = 0
            headStr = left(s, l)
        else
            ` never take more chars than the string has
            charCount = l
            if charCount > _strLength then charCount = _strLength

            ` the offset stored for the last wanted char is the byte length of the result
            endByte = _strMemblockCharOffsets[charCount]
            headStr = getMemblockString(_strMemblock, 0, endByte)
        endif
    endif
endfunction headStr

` UTF-8 aware analogue of right(): returns the last l chars of s.
` Returns '' when s is empty or l < 1.
` When strUni() reports 0 for s, the built-in right() is used instead.
function strRight(s as string, l as integer)
    tailStr as string

    if l >= 1 and s <> ''
        if strUni(s) = 0
            tailStr = right(s, l)
        else
            ` number of leading chars to skip; none when l covers the whole string
            startByte = 0
            skipChars = _strLength - l
            if skipChars > 0 then startByte = _strMemblockCharOffsets[skipChars]

            ` copy everything from there up to the end of the last char
            endByte = _strMemblockCharOffsets[_strLength]
            tailStr = getMemblockString(_strMemblock, startByte, endByte - startByte)
        endif
    endif
endfunction tailStr

` Builds the char for ID r, stores it in the _UTF8 cache and returns it.
` r - char ID in the format produced by getUTF8() / int2() / int3() / int4()
` Returns '' (and caches nothing) when r = 0 or r does not look like a valid ID.
` The function does not check whether r is already cached; chrUTF8() does that.
function newUTF8(r as integer)
    bytes   as integer[]
    x       as UTF8Type

    if r = 0 then exitfunction ''

    ` split r into its 4 bytes and drop the leading zero bytes
    setMemblockInt(_intMemblock, 0, r)
    for i = 0 to 3
        b = getMemblockByte(_intMemblock, i)
            if bytes.length > -1 or b > 0 then bytes.insert(b)
    next

    ` bytes.length is the index of the last element, i.e. byte count - 1
    ` WARN trail bytes of multi-byte chars are still not validated, only the lead byte and the byte count
    if bytes[0] < 192
        ` single-byte char: the whole ID must be the byte itself (1..127);
        ` this also rejects a trail byte (128..191) in lead position and
        ` IDs like 0x4141 that would otherwise be cached as a multi-char string
        if r < 1 or r > 127 then exitfunction ''
    else
        if bytes[0] < 224
            if bytes.length <> 1 then exitfunction ''
        elseif bytes[0] < 240
            if bytes.length <> 2 then exitfunction ''
        elseif bytes[0] < 248
            if bytes.length <> 3 then exitfunction ''
        else
            exitfunction ''
        endif
    endif

    x.ID = r
    ` the char bytes start right after the dropped leading zero bytes
    x.char = getMemblockString(_intMemblock, 3 - bytes.length, bytes.length + 1)
    ` x.bytes = bytes  ` uncomment if you need to use it somewhere
        _UTF8.insertSorted(x)
endfunction x.char

` Analogue of chr(): returns the char for ID r.
` The _UTF8 cache is consulted first; on a miss the char is built (and cached) by newUTF8(),
` which returns '' for an ID it rejects.
function chrUTF8(r as integer)
    glyph as string

    cachedAt = _UTF8.find(r)
    if cachedAt < 0
        ` not cached yet
        glyph = newUTF8(r)
    else
        glyph = _UTF8[cachedAt].char
    endif
endfunction glyph

` Analogue of asc(): returns the ID of the first char of s.
` Returns 0 for an empty string and _chrUnknownID ('?') when the first byte is not an
` accepted lead byte or the string is too short to hold the whole sequence.
` Only the lead byte is inspected; trail bytes are not validated.
function getUTF8(s as string)
    sl = len(s)

    if sl = 0 then exitfunction 0

    ` temporary memblock giving byte access to s
    m = createMemblock(sl + 1)
        setMemblockString(m, 0, s)

    b = getMemblockByte(m, 0)
        if b < 128
            i = b
        elseif b > 191
            ` a sequence of N bytes needs sl >= N; otherwise the terminator written after
            ` the string would be read as a trail byte and a bogus ID returned
            if b < 224
                if sl > 1
                    i = int2(b, getMemblockByte(m, 1))
                else
                    i = _chrUnknownID
                endif
            elseif b < 240
                if sl > 2
                    i = int3(b, getMemblockByte(m, 1), getMemblockByte(m, 2))
                else
                    i = _chrUnknownID
                endif
            elseif b < 248
                if sl > 3
                    i = int4(b, getMemblockByte(m, 1), getMemblockByte(m, 2), getMemblockByte(m, 3))
                else
                    i = _chrUnknownID
                endif
            else  ` it's not Unicode char
                i = _chrUnknownID
            endif
        else  ` it's not Unicode char
            i = _chrUnknownID
        endif

        deleteMemblock(m)
endfunction i

` Packs the two bytes of a 2-byte char (b = lead byte, a = trail byte) into an integer ID.
` The bytes occupy the last two positions of the 4-byte scratch memblock, the first two are zero.
function int2(b as integer, a as integer)
    ` same layout as int4() with two zero bytes in front
    packed = int4(0, 0, b, a)
endfunction packed

` Packs the three bytes of a 3-byte char (c = lead byte, then b, then a) into an integer ID.
` The bytes occupy the last three positions of the 4-byte scratch memblock, the first one is zero.
function int3(c as integer, b as integer, a as integer)
    ` same layout as int4() with one zero byte in front
    packed = int4(0, c, b, a)
endfunction packed

` Packs the four bytes of a 4-byte char (d = lead byte, then c, b, a) into an integer ID.
` The bytes are written to positions 0..3 of the scratch memblock and read back as one integer.
function int4(d as integer, c as integer, b as integer, a as integer)
    ` the four writes are independent of each other, so their order does not matter
    setMemblockByte(_intMemblock, 3, a)
    setMemblockByte(_intMemblock, 2, b)
    setMemblockByte(_intMemblock, 1, c)
    setMemblockByte(_intMemblock, 0, d)

    packed = getMemblockInt(_intMemblock, 0)
endfunction packed