Previous Page
Next Page

4.10. Using UTF-8 with JavaScript

Modern browsers have Unicode support built right into JavaScript—the basic String class stores code points rather than bytes, and the string manipulation functions work correctly. When you copy data in and out of forms using JavaScript, the data that finally gets submitted is UTF-8 (assuming you specified that for the page's encoding type).

The only thing to watch out for is that the built-in function escape( ), which is used to format strings for inclusion in a URL, does not support Unicode characters. This means that if you want to let users input text that you'll then build a URL from (such as building a GET query string), then you can't use escape( ).

Luckily, since JavaScript supports code points natively and allows you to query them using the String.charCodeAt( ) method, you can fairly easily write your own UTF-8-safe escaping function:

/**
 * Percent-encodes a value as UTF-8 for inclusion in a URL.
 *
 * Every character is escaped (including ASCII letters and digits): the
 * value is converted to a string, each Unicode code point is encoded as
 * 1-4 UTF-8 bytes, and each byte is written as %XX with uppercase hex.
 *
 * @param {*} data - Value to escape. null and undefined yield ''; any
 *     other value is converted with String() first, so 0 becomes '%30'.
 * @returns {string} The escaped string ('' for empty input).
 */
function escape_utf8(data) {
        // Only null/undefined are treated as "no data"; a loose == '' test
        // would also swallow 0 and false.
        if (data == null){
                return '';
        }
        var text = String(data);
        var buffer = '';
        for (var i = 0; i < text.length; i++){
                var c = text.charCodeAt(i);

                // charCodeAt( ) yields UTF-16 code units, so a character above
                // U+FFFF arrives as a high/low surrogate pair that has to be
                // combined into a single code point before encoding.
                if (c >= 0xD800 && c <= 0xDBFF){
                        var low = (i + 1 < text.length) ? text.charCodeAt(i + 1) : 0;
                        if (low >= 0xDC00 && low <= 0xDFFF){
                                c = 0x10000 + ((c - 0xD800) << 10) + (low - 0xDC00);
                                i++; // the low surrogate has been consumed
                        }else{
                                c = 0xFFFD; // unpaired high surrogate
                        }
                }else if (c >= 0xDC00 && c <= 0xDFFF){
                        c = 0xFFFD; // unpaired low surrogate
                }

                var bs;
                if (c >= 0x10000){
                        // 4 bytes
                        bs = [
                                0xF0 | (c >>> 18),
                                0x80 | ((c >>> 12) & 0x3F),
                                0x80 | ((c >>> 6) & 0x3F),
                                0x80 | (c & 0x3F)
                        ];
                }else if (c >= 0x800){
                        // 3 bytes
                        bs = [
                                0xE0 | (c >>> 12),
                                0x80 | ((c >>> 6) & 0x3F),
                                0x80 | (c & 0x3F)
                        ];
                }else if (c >= 0x80){
                        // 2 bytes
                        bs = [
                                0xC0 | (c >>> 6),
                                0x80 | (c & 0x3F)
                        ];
                }else{
                        // 1 byte
                        bs = [c];
                }

                for (var j = 0; j < bs.length; j++){
                        var hex = bs[j].toString(16).toUpperCase();
                        // Bytes below 0x10 print as one digit; pad to %XX.
                        buffer += '%' + (hex.length < 2 ? '0' : '') + hex;
                }
        }
        return buffer;
}
/**
 * Maps a 4-bit value to its uppercase hexadecimal digit.
 *
 * @param {number} nibble - Index into the digit table, expected 0-15.
 * @returns {string} One character from '0'-'9' or 'A'-'F'; an empty
 *     string when the index falls outside the table.
 */
function nibble_to_hex(nibble){
        return '0123456789ABCDEF'.charAt(nibble);
}

The escape_utf8( ) function works by iterating over each code point in the string, creating a UTF-8 byte stream. It then loops over the bytes in this stream, formatting each one using the %XX format for escaping bytes in a URL. A further improvement to this function would be to leave alphanumeric characters as-is in the escaped version of the string, so that the returned values are easily readable in the common case.


Previous Page
Next Page