Decode UTF-8 with Javascript

Question

I have Javascript in an XHTML web page that is passing UTF-8 encoded strings. It needs to continue to pass the UTF-8 version, as well as decode it. How is it possible to decode a UTF-8 string for display?

<script type="text/javascript">
// <![CDATA[
function updateUser(usernameSent){
    var usernameReceived = usernameSent; // Current value: GrÃƒÂ¶ÃƒÂŸe
    var usernameDecoded = usernameReceived;  // Decode to: Größe
    var html2id = '';
    html2id += 'Encoded: ' + usernameReceived + '<br />Decoded: ' + usernameDecoded;
    document.getElementById('userId').innerHTML = html2id;
}
// ]]>
</script>

CpnCrunch · Accepted Answer

To answer the original question: here is how you decode utf-8 in javascript:

http://ecmanaut.blogspot.ca/2006/07/encoding-decoding-utf8-in-javascript.html

Specifically,

function encode_utf8(s) {
  return unescape(encodeURIComponent(s));
}

function decode_utf8(s) {
  return decodeURIComponent(escape(s));
}

We have been using this in our production code for 6 years, and it has worked flawlessly.

Note, however, that escape() and unescape() are deprecated. See this.

Albert · Answer

This should work:

// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt

/* utf.js - UTF-8 <=> UTF-16 convertion
 *
 * Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
 * Version: 1.0
 * LastModified: Dec 25 1999
 * This library is free.  You can redistribute it and/or modify it.
 */

function Utf8ArrayToStr(array) {
    var out, i, len, c;
    var char2, char3;

    out = "";
    len = array.length;
    i = 0;
    while(i < len) {
    c = array[i++];
    switch(c >> 4)
    { 
      case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
        // 0xxxxxxx
        out += String.fromCharCode(c);
        break;
      case 12: case 13:
        // 110x xxxx   10xx xxxx
        char2 = array[i++];
        out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
        break;
      case 14:
        // 1110 xxxx  10xx xxxx  10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        out += String.fromCharCode(((c & 0x0F) << 12) |
                       ((char2 & 0x3F) << 6) |
                       ((char3 & 0x3F) << 0));
        break;
    }
    }

    return out;
}

Check out the JSFiddle demo.

Also see the related questions: here and here

Jonathan · Answer

Perhaps using the textDecoder will be sufficient.

Not supported in IE though.

var decoder = new TextDecoder('utf-8'),
    decodedMessage;

decodedMessage = decoder.decode(message.data);

Handling non-UTF8 text

In this example, we decode the Russian text "Привет, мир!", which means "Hello, world." In our TextDecoder() constructor, we specify the Windows-1251 character encoding, which is appropriate for Cyrillic script.

    let win1251decoder = new TextDecoder('windows-1251');
    let bytes = new Uint8Array([207, 240, 232, 226, 229, 242, 44, 32, 236, 232, 240, 33]);
    console.log(win1251decoder.decode(bytes)); // Привет, мир!

The interface for the TextDecoder is described here.

Retrieving a byte array from a string is equally simpel:

const decoder = new TextDecoder();
const encoder = new TextEncoder();

const byteArray = encoder.encode('Größe');
// converted it to a byte array

// now we can decode it back to a string if desired
console.log(decoder.decode(byteArray));

If you have it in a different encoding then you must compensate for that upon encoding. The parameter in the constructor for the TextEncoder is any one of the valid encodings listed here.

lauthu · Answer

Update @Albert's answer adding condition for emoji.

function Utf8ArrayToStr(array) {
    var out, i, len, c;
    var char2, char3, char4;

    out = "";
    len = array.length;
    i = 0;
    while(i < len) {
    c = array[i++];
    switch(c >> 4)
    { 
      case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
        // 0xxxxxxx
        out += String.fromCharCode(c);
        break;
      case 12: case 13:
        // 110x xxxx   10xx xxxx
        char2 = array[i++];
        out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
        break;
      case 14:
        // 1110 xxxx  10xx xxxx  10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        out += String.fromCharCode(((c & 0x0F) << 12) |
                       ((char2 & 0x3F) << 6) |
                       ((char3 & 0x3F) << 0));
        break;
     case 15:
        // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
        char2 = array[i++];
        char3 = array[i++];
        char4 = array[i++];
        out += String.fromCodePoint(((c & 0x07) << 18) | ((char2 & 0x3F) << 12) | ((char3 & 0x3F) << 6) | (char4 & 0x3F));

        break;
    }

    return out;
}

Matthew Voss · Answer

Here is a solution handling all Unicode code points include upper (4 byte) values and supported by all modern browsers (IE and others > 5.5). It uses decodeURIComponent(), but NOT the deprecated escape/unescape functions:

function utf8_to_str(a) {
    for(var i=0, s=''; i<a.length; i++) {
        var h = a[i].toString(16)
        if(h.length < 2) h = '0' + h
        s += '%' + h
    }
    return decodeURIComponent(s)
}

Tested and available on GitHub

To create UTF-8 from a string:

function utf8_from_str(s) {
    for(var i=0, enc = encodeURIComponent(s), a = []; i < enc.length;) {
        if(enc[i] === '%') {
            a.push(parseInt(enc.substr(i+1, 2), 16))
            i += 3
        } else {
            a.push(enc.charCodeAt(i++))
        }
    }
    return a
}

Tested and available on GitHub

Decode UTF-8 with Javascript

Tags:

javascript

unicode

xhtml-transitional

utf8-decode

Jarrett Mattson

5 Answers

CpnCrunch

Albert

Handling non-UTF8 text

Jonathan

lauthu

Matthew Voss

Recent Activity

Donate For Us

Decode UTF-8 with Javascript

Tags:

javascript

unicode

xhtml-transitional

utf8-decode

Jarrett Mattson

5 Answers

CpnCrunch

Albert

Handling non-UTF8 text

Jonathan

lauthu

Matthew Voss

Related questions

Recent Activity

Donate For Us