The logic of encoding Unicode in UTF-8 is basically:
Here's a function I wrote a while back for encoding a JavaScript UTF-16 string in UTF-8:
function toUTF8Array(str) {
var utf8 = [];
for (var i=0; i < str.length; i++) {
var charcode = str.charCodeAt(i);
if (charcode < 0x80) utf8.push(charcode);
else if (charcode < 0x800) {
utf8.push(0xc0 | (charcode >> 6),
0x80 | (charcode & 0x3f));
}
else if (charcode < 0xd800 || charcode >= 0xe000) {
utf8.push(0xe0 | (charcode >> 12),
0x80 | ((charcode>>6) & 0x3f),
0x80 | (charcode & 0x3f));
}
// surrogate pair
else {
i++;
// UTF-16 encodes 0x10000-0x10FFFF by
// subtracting 0x10000 and splitting the
// 20 bits of 0x0-0xFFFFF into two halves
charcode = 0x10000 + (((charcode & 0x3ff)<<10)
| (str.charCodeAt(i) & 0x3ff));
utf8.push(0xf0 | (charcode >>18),
0x80 | ((charcode>>12) & 0x3f),
0x80 | ((charcode>>6) & 0x3f),
0x80 | (charcode & 0x3f));
}
}
return utf8;
}
JavaScript String
s are stored in UTF-16. To get UTF-8, you'll have to convert the String
yourself.
One way is to mix encodeURIComponent()
, which will output UTF-8 bytes URL-encoded, with unescape
, as mentioned on ecmanaut.
var utf8 = unescape(encodeURIComponent(str));
var arr = [];
for (var i = 0; i < utf8.length; i++) {
arr.push(utf8.charCodeAt(i));
}
The Encoding API lets you both encode and decode UTF-8 easily (using typed arrays):
var encoded = new TextEncoder().encode("Γεια σου κόσμε");
var decoded = new TextDecoder("utf-8").decode(encoded);
console.log(encoded, decoded);
Browser support isn't too bad, and there's a polyfill that should work in IE11 and older versions of Edge.
While TextEncoder
can only encode to UTF-8, TextDecoder
supports other encodings. I used it to decode Japanese text (Shift-JIS) in this way:
// Shift-JIS encoded text; must be a byte array due to values 129 and 130.
var arr = [130, 108, 130, 102, 130, 80, 129, 64, 130, 102, 130, 96, 130, 108, 130, 100,
129, 64, 130, 99, 130, 96, 130, 115, 130, 96, 129, 124, 130, 79, 130, 80];
// Convert to byte array
var data = new Uint8Array(arr);
// Decode with TextDecoder
var decoded = new TextDecoder("shift-jis").decode(data.buffer);
console.log(decoded);
The Google Closure library has functions to convert to/from UTF-8 and byte arrays. If you don't want to use the whole library, you can copy the functions from here. For completeness, the code to convert to a string to a UTF-8 byte array is:
goog.crypt.stringToUtf8ByteArray = function(str) {
// TODO(user): Use native implementations if/when available
var out = [], p = 0;
for (var i = 0; i < str.length; i++) {
var c = str.charCodeAt(i);
if (c < 128) {
out[p++] = c;
} else if (c < 2048) {
out[p++] = (c >> 6) | 192;
out[p++] = (c & 63) | 128;
} else if (
((c & 0xFC00) == 0xD800) && (i + 1) < str.length &&
((str.charCodeAt(i + 1) & 0xFC00) == 0xDC00)) {
// Surrogate Pair
c = 0x10000 + ((c & 0x03FF) << 10) + (str.charCodeAt(++i) & 0x03FF);
out[p++] = (c >> 18) | 240;
out[p++] = ((c >> 12) & 63) | 128;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
} else {
out[p++] = (c >> 12) | 224;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
}
}
return out;
};
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With