Retrieving binary file content using Javascript, base64 encode it and reverse-decode it using Python

I'm trying to download a binary file using XMLHttpRequest (using a recent Webkit) and base64-encode its contents using this simple function:

/**
 * Downloads `file` with a blocking (synchronous) GET request and returns the
 * response body as a string.
 *
 * The MIME type is overridden to "text/plain; charset=x-user-defined" before
 * sending, and the result is read from `responseText`.
 *
 * @param {string} file - URL of the resource to fetch.
 * @returns {string} The response text.
 */
function getBinary(file) {
    var request = new XMLHttpRequest();
    request.open("GET", file, false);
    request.overrideMimeType("text/plain; charset=x-user-defined");
    request.send(null);
    return request.responseText;
}

/**
 * Base64-encodes the UTF-8 representation of a string.
 *
 * NOTE: because the string goes through encodeURIComponent first, the output
 * is the base64 of the UTF-8 bytes of each character, not of one byte per
 * character.
 *
 * @param {string} binary - Text to encode.
 * @returns {string} Base64 text.
 */
function base64encode(binary) {
    // Percent-encode as UTF-8, then collapse every %XX back into one char so
    // that btoa() only ever sees code units in the 0-255 range.
    var percentEncoded = encodeURIComponent(binary);
    var utf8ByteString = unescape(percentEncoded);
    return btoa(utf8ByteString);
}

var binary = getBinary('http://some.tld/sample.pdf');
var base64encoded = base64encode(binary);

As a side note, everything above is standard Javascript stuff, including btoa() and encodeURIComponent(): https://developer.mozilla.org/en/DOM/window.btoa

This works pretty smoothly, and I can even decode the base64 contents using Javascript:

/**
 * Inverse of base64encode(): decodes base64 text whose payload is the UTF-8
 * representation of a string, and returns that string.
 *
 * @param {string} base64 - Base64 text to decode.
 * @returns {string} The decoded string.
 */
function base64decode(base64) {
    // atob() yields one char per byte; escape() turns those into %XX
    // sequences that decodeURIComponent() then reads back as UTF-8.
    var utf8ByteString = atob(base64);
    var percentEncoded = escape(utf8ByteString);
    return decodeURIComponent(percentEncoded);
}

var decodedBinary = base64decode(base64encoded);
decodedBinary === binary; // true

Now, I want to decode the base64-encoded contents using Python, which consumes a JSON string to get the base64-encoded string value. Naively, this is what I do:

import urllib
import base64

# ... retrieving of base64 encoded string through JSON
# The value is stored in `encoded` (not `base64`) so that it does not rebind
# the imported `base64` module used on the next line.
encoded = "77+9UE5HDQ……………oaCgA="
source_contents = urllib.unquote(base64.b64decode(encoded))
# `with` closes the file even if the write fails.
with open(destination, 'wb') as destination_file:
    destination_file.write(source_contents)

But the resulting file is invalid; it looks like the operation is messed up by UTF-8, encoding or something else which is still unclear to me.

If I try to decode UTF-8 contents before putting them in the destination file, an error is raised:

import urllib
import base64

# ... retrieving of base64 encoded string through JSON
# The value is stored in `encoded` (not `base64`) so that it does not rebind
# the imported `base64` module used on the next line.
encoded = "77+9UE5HDQ……………oaCgA="
source_contents = urllib.unquote(base64.b64decode(encoded)).decode('utf-8')
# `with` closes the file even if the write fails.
with open(destination, 'wb') as destination_file:
    destination_file.write(source_contents)

# $ python test.py
# // ...
# UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 0: ordinal not in range(128)

As a side note, here's a screenshot of two textual representations of the same file; on the left: the original; on the right: the one created from the base64-decoded string: http://cl.ly/0U3G34110z3c132O2e2x

Is there a known trick to circumvent these encoding problems when attempting to recreate the file? How would you achieve this yourself?

Any help or hint much appreciated :)

1 Answers

So I'm answering to myself — and sorry for that — but I think it might be useful for someone as lost as I was ;)

So you have to use ArrayBuffer and set the responseType property of your XMLHttpRequest object instance to arraybuffer for retrieving a native array of Bytes, which can be converted to base64 using the following convenient function (found there, author may be blessed here):

/**
 * Converts the bytes of an ArrayBuffer to a base64 string, using the
 * A-Z a-z 0-9 + / alphabet with '=' padding.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw bytes to encode.
 * @returns {string} Base64 text; an empty string for an empty buffer.
 */
function base64ArrayBuffer(arrayBuffer) {
  var base64 = '';
  var encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

  var bytes = new Uint8Array(arrayBuffer);
  var byteLength = bytes.byteLength;
  var byteRemainder = byteLength % 3;
  var mainLength = byteLength - byteRemainder;

  var a, b, c, d;
  var chunk;

  // Main loop deals with bytes in chunks of 3
  for (var i = 0; i < mainLength; i = i + 3) {
    // Combine the three bytes into a single integer
    chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2];

    // Use bitmasks to extract 6-bit segments from the triplet
    a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18
    b = (chunk & 258048) >> 12;   // 258048   = (2^6 - 1) << 12
    c = (chunk & 4032) >> 6;      // 4032     = (2^6 - 1) << 6
    d = chunk & 63;               // 63       = 2^6 - 1

    // Convert the raw binary segments to the appropriate ASCII encoding
    base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d];
  }

  // Deal with the remaining bytes and padding
  if (byteRemainder === 1) {
    chunk = bytes[mainLength];

    a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2

    // Set the 4 least significant bits to zero
    b = (chunk & 3) << 4; // 3 = 2^2 - 1

    base64 += encodings[a] + encodings[b] + '==';
  } else if (byteRemainder === 2) {
    chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1];

    a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10
    b = (chunk & 1008) >> 4;   // 1008  = (2^6 - 1) << 4

    // Set the 2 least significant bits to zero
    c = (chunk & 15) << 2; // 15 = 2^4 - 1

    base64 += encodings[a] + encodings[b] + encodings[c] + '=';
  }

  return base64;
}

So here's some working code:

// Fetch the image as raw bytes and log its base64 representation.
var xhr = new XMLHttpRequest();
// The request must be asynchronous (third argument `true`): setting
// `responseType` on a synchronous request is rejected in a window context.
xhr.open('GET', 'http://some.tld/favicon.png', true);
xhr.responseType = 'arraybuffer';
xhr.onload = function () {
    // With responseType 'arraybuffer', `response` holds an ArrayBuffer.
    console.log(base64ArrayBuffer(xhr.response));
};
xhr.send();

This will log a valid base64 encoded string representing the binary file contents.

Edit: For older browsers not having access to ArrayBuffer and having btoa() failing on encoding characters, here's another way to get a base64 encoded version of any binary:

/**
 * Downloads `file` with a blocking (synchronous) GET request and returns the
 * response body as a string.
 *
 * The MIME type is overridden to "text/plain; charset=x-user-defined" before
 * sending, and the result is read from `responseText`.
 *
 * @param {string} file - URL of the resource to fetch.
 * @returns {string} The response text.
 */
function getBinary(file) {
    var request = new XMLHttpRequest();
    request.open("GET", file, false);
    request.overrideMimeType("text/plain; charset=x-user-defined");
    request.send(null);
    return request.responseText;
}

/**
 * Base64-encodes a string, treating the low 8 bits of each UTF-16 code unit
 * as one byte (any higher bits are discarded).
 *
 * @param {string} str - String whose code units carry the bytes to encode.
 * @returns {string} Base64 text, padded with '=' to a multiple of 4 chars.
 */
function base64Encode(str) {
    var CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    var total = str.length;
    var pieces = [];

    // Walk the input three bytes at a time; the last group may hold 1 or 2.
    for (var pos = 0; pos < total; pos += 3) {
        var available = total - pos;
        var b1 = str.charCodeAt(pos) & 0xff;
        // Missing bytes in a short final group contribute zero bits.
        var b2 = available > 1 ? str.charCodeAt(pos + 1) & 0xff : 0;
        var b3 = available > 2 ? str.charCodeAt(pos + 2) & 0xff : 0;

        pieces.push(CHARS.charAt(b1 >> 2));
        pieces.push(CHARS.charAt(((b1 & 0x3) << 4) | (b2 >> 4)));
        pieces.push(available > 1 ? CHARS.charAt(((b2 & 0xF) << 2) | (b3 >> 6)) : "=");
        pieces.push(available > 2 ? CHARS.charAt(b3 & 0x3F) : "=");
    }

    return pieces.join("");
}

console.log(base64Encode(getBinary('http://www.google.fr/images/srpr/logo3w.png')));

Hope this helps others as it did for me.

