Don't click here unless you want to be banned.

LSL Wiki : Unicode

HomePage :: PageIndex :: RecentChanges :: RecentlyCommented :: UserSettings :: You are crawl338.us.archive.org

Unicode


What it is
Computers primarily deal with numbers. Any character or graphic that you see is represented by an internal numeric code. The set of codes that represents a given set of characters is referred to as a character-set. Computers use the codes to find the correct symbol in a font, for display.

Traditional computer character sets comprised only 128 different codes (including some that had special meaning and did not directly translate to a symbol). This was fine for the display terminals of the time, but internationalisation required more. ISO-8859-1 (aka Latin-1) expanded the set of available symbols, but human languages use literally tens of thousands of different symbols.

Unicode was originally standardised in 1991 (by Unicode, Inc.), and has undergone several revisions since then. At last count it supports more than 90,000 codes, with more being added. While several standards currently compete for technical, historical and political reasons, Unicode is the most complete character code representation available at the present time. For a more in-depth history of Unicode, try here.

What does it mean to me?
You may have noticed that some SL residents chat in languages other than English. Some of our Japanese residents, for example, chat in their native language, using their native character set. If you have the proper font installed, you will see the characters as intended. If you do not, you'll see a series of small grey blocks in your chat history (your operating system's way of telling you that it cannot find a font which contains symbols that match the character codes).

Not every part of SL is happy with Unicode. Although the standard is some years old now, its adoption has been slow. Generally you will find that Unicode works with Chat, IM, and the email/IM gateway. Other parts, such as object names and inventory search, may not understand typed characters that fall outside Latin-1.

How Unicode is implemented in SL
The encoding that SL uses is UTF-8. Raw UTF-8 strings can be built with llUnescapeURL. SL isn't fully compliant with UTF-8 yet.

Functions
These two functions will make you wish you were never born.
integer UTF8ToUnicodeInteger(string character)
Returns the Unicode value of the character. If the character is not part of the UTF-8 character set or is invalid, the function returns the first byte and sets the negative bit.
string UnicodeIntegerToUTF8(integer unicode)
returns the utf-8 encoded version of the integer unicode. Standard Compliant.
// Decodes the first character of `a` into an integer by URL-escaping it with
// llEscapeURL and interpreting the escaped text.
//   a       - input string; only its first character (index 0) is examined.
//   returns - the decoded integer held in b.
// NOTE(review): several expressions below assign to b and read b in the same
// expression; they appear to rely on LSL evaluating the right-hand operand
// first. Do not reorder operands without confirming that.
// NOTE(review): when the escaped text is neither 1 character long nor a
// multiple of 3, b is returned still holding that length - confirm intended.
integer UTF8ToUnicodeInteger(string a)
{
    integer b;
    // Replace a with the escaped form of its first character; an empty result
    // skips all decoding and b is returned as declared.
    if(a = llEscapeURL(llGetSubString(a,0,0)))
    {
        // b takes the escaped length. Length 1 means the character was left
        // unescaped by llEscapeURL.
        if(1 == b = llStringLength(a))
        {
            // Presumably (integer)a is the digit value for "1".."9", making b
            // the character code directly; b == 48 then means a was "0" or a
            // non-digit - TODO confirm cast behaviour.
            if(48 == (b = 48 + (integer)a))
                // Look a up in a table laid out so that index + 48, plus 16
                // for any non-zero index, gives the code: "0" -> 48,
                // "A" (index 1) -> 65, "a" (index 33) -> 97. The six spaces
                // pad the gap between "Z" and "a".
                b = (!!b << 4) + 48 + b = llSubStringIndex("0ABCDEFGHIJKLMNOPQRSTUVWXYZ      abcdefghijklmnopqrstuvwxyz",a);
        }
        // Escaped form made of 3-character %XX groups.
        else if(!(b % 3))
        {
            // Drop the "%" markers, leaving only the hex digits.
            a = (string)llParseString2List(a,["%"],[]);
            // One escaped byte: its hex value is the result.
            if(b == 3)
                b = (integer)("0x"+a);
            else 
            {
                // Multi-byte sequence. Reading the operands from the bottom up
                // (see NOTE above):
                //  - the final mask keeps 11/16/21/26/31 low bits for
                //    b/3 = 2/3/4/5/6 escaped bytes (b is still the length);
                //  - b is then set to the hex digits left after removing the
                //    last 8, contributing bit 0x100 (shifted to bit 30) and
                //    its low 6 bits (shifted to bits 24-29);
                //  - b is then set to the last 8 hex digits, and 6 bits are
                //    taken from each of those four bytes and packed together.
                b = ((b & 0x3f) |
                    ((b & 0x3f00) >> 2) | 
                    ((b & 0x3f0000) >> 4) | 
                    (((b = (integer)("0x"+llGetSubString(a,-8,-1))) & 0x3f000000) >> 6) | 
                    ((b & 0x3f) << 24) | 
                    (((b = (integer)("0x"+llDeleteSubString(a,-8,-1))) & 0x100) << 22)) &
                    (0x7FFFFFFF >> (30 - (5 * (b/3))));
            }
        }
    }
    return b;
}

// Returns the two-character upper-case hexadecimal form of the low 8 bits of x.
//   x       - integer; bits above the low byte are ignored.
//   returns - exactly two characters from "0123456789ABCDEF".
string byte2hex(integer x)
{
    string hexc = "0123456789ABCDEF";
    // Extract both nibbles into their own locals so the result does not depend
    // on the order in which call arguments or "+" operands are evaluated.
    integer hi = (x >> 4) & 0xF;
    integer lo = x & 0xF;
    return llGetSubString(hexc, hi, hi) + llGetSubString(hexc, lo, lo);
}
 
// Encodes the code point `a` as a UTF-8 string.
//   a       - code point to encode; zero and negative values yield "".
//   returns - the character, produced by percent-encoding each UTF-8 byte and
//             passing the result through llUnescapeURL.
string UnicodeIntegerToUTF8(integer a)
{
    // Only 31-bit values can be encoded, so negatives (and zero) are rejected.
    if(a <= 0)
        return "";

    // Number of continuation bytes that follow the lead byte (0 to 5).
    integer tail = (a >= 0x80) + (a >= 0x800) + (a >= 0x10000) + (a >= 0x200000) + (a >= 0x4000000);

    // Lead byte: the top payload bits OR'd with the length prefix. byte2hex
    // only looks at the low 8 bits, so the surplus high bits are harmless.
    string encoded = "%" + byte2hex((a >> (6 * tail)) | ((0x7F80 >> tail) << !tail));

    // Continuation bytes, most significant first, each of the form 10xxxxxx.
    while(tail)
    {
        --tail;
        encoded += "%" + byte2hex(((a >> (6 * tail)) | 0x80) & 0xBF);
    }
    return llUnescapeURL(encoded);
}

These functions are attempts at obtaining the number of bytes in a UTF8 string:
// Returns the number of bytes `msg` occupies when encoded as UTF-8.
//   msg     - string to measure.
//   returns - byte count; 0 for an empty string.
// Originally released to the public domain by kimmie Loveless.
integer UTF8Length(string msg)
{
    // llEscapeURL turns every escaped byte into three characters ("%XX") and
    // leaves the remaining single-byte characters alone. A literal "%" is
    // itself escaped, so every "%" in the output marks exactly one byte.
    string escaped = llEscapeURL(msg);
    integer escapedLen = llStringLength(escaped);

    // Length of the escaped text with the "%" markers stripped out.
    integer withoutMarkers = llStringLength((string)llParseString2List(escaped, ["%"], []));

    // (escapedLen - withoutMarkers) is the number of escaped bytes; each one
    // contributed two extra characters beyond the single byte it represents.
    // The previous "/4" estimate was wrong for, e.g., two spaces (gave 3) and
    // for 4-byte characters (gave 3).
    return escapedLen - 2 * (escapedLen - withoutMarkers);
}

I'm pretty sure you cannot scan for that many tokens all at once; not to mention that llEscapeURL is borked. -BW
// Returns the number of bytes `n` occupies when encoded as UTF-8.
//   n       - string to measure.
//   returns - byte count; 0 for an empty string.
// The earlier version passed a list of every possible "%XX" token as
// separators, but llParseString2List / llParseStringKeepNulls only honour the
// first 8 separators, and counting null entries did not equal counting tokens.
// A single "%" separator is enough, because llEscapeURL escapes a literal "%"
// too, so each "%" in its output marks exactly one escaped byte.
integer stringBytes(string n) {
    if (n == "") {
        return 0;
    }
    n = llEscapeURL(n);
    // Splitting on "%" while keeping nulls yields one more piece than there
    // are "%" markers.
    integer escapedBytes = llGetListLength(llParseStringKeepNulls(n, ["%"], [])) - 1;
    // Each escaped byte is written as three characters, i.e. two more than the
    // single byte it stands for.
    return llStringLength(n) - 2 * escapedBytes;
}

Here's another test, showing that ASCII chars take 1 byte each, Latin-1 2 bytes each.

// Selects which memory test the touch handler runs:
// 1 runs test1() (ASCII characters), 2 runs test2() (Latin-1 characters).
// Any other value makes a touch do nothing.
integer TEST = 1;

// Builds a 100-character string of the ASCII character "1" and reports to the
// owner how much the value returned by llGetFreeMemory dropped while doing so.
test1() {
    integer i;
    string s1;
    integer before1;
    integer after1;

    // Free memory before the string is grown.
    before1 = llGetFreeMemory();
    for (i = 0; i < 100; ++i) {
        // Append one character per pass. NOTE(review): the (s1 = "") term is
        // presumably the LSL memory-saving idiom that relies on s1 being read
        // before it is cleared - confirm before changing this line.
        s1 = (s1 = "") + s1 + "1";
    }
    // Free memory after the string is complete.
    after1 = llGetFreeMemory();

    // Report the string length and the difference between the two readings.
    llOwnerSay((string)llStringLength(s1)+" ASCII = " + (string)(before1 - after1) + " bytes");
}

// Builds a 100-character string of a Latin-1 character and reports to the
// owner how much the value returned by llGetFreeMemory dropped while doing so.
test2() {
    integer i;
    string s2;
    integer before2;
    integer after2;
    // The Latin-1 character used for the test: U+00E9, built from its two
    // UTF-8 bytes. The literal previously written here had been mangled into
    // U+FFFD (three UTF-8 bytes) by an encoding conversion; building it from
    // escapes keeps the test independent of the file's encoding. It is created
    // before the first memory reading so it is not part of the measurement.
    string latin1 = llUnescapeURL("%C3%A9");

    // Free memory before the string is grown.
    before2 = llGetFreeMemory();
    for (i = 0; i < 100; ++i) {
        // Append one character per pass, using the same (s2 = "") form as the
        // ASCII test so the two measurements stay comparable.
        s2 = (s2 = "") + s2 + latin1;
    }
    // Free memory after the string is complete.
    after2 = llGetFreeMemory();

    // Report the string length and the difference between the two readings.
    llOwnerSay((string)llStringLength(s2)+" Latin-1 = " + (string)(before2 - after2) + " bytes");
}

// Default state: announces the selected test on entry and runs it on touch.
default
{
    state_entry()
    {
        // Tell nearby avatars which test a touch will start.
        string prompt = "Touch to start test " + (string)TEST;
        llSay(0, prompt);
    }

    touch_start(integer total_number)
    {
        // Dispatch on the TEST selector; values other than 1 or 2 do nothing.
        if (TEST == 1)
        {
            test1();
            return;
        }
        if (TEST == 2)
        {
            test2();
        }
    }
}
There are 6 comments on this page. [Display comments/form]