Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Retrieving binary file content using Javascript, base64 encode it and reverse-decode it using Python

I'm trying to download a binary file using XMLHttpRequest (using a recent Webkit) and base64-encode its contents using this simple function:

// Download `file` with a blocking XMLHttpRequest and return the response body as a string.
function getBinary(file){
    var xhr = new XMLHttpRequest();
    // The third argument `false` makes the request synchronous: send() returns only once it has completed.
    xhr.open("GET", file, false);
    // Override the served MIME type so the body is read as text in the x-user-defined charset.
    xhr.overrideMimeType("text/plain; charset=x-user-defined");
    xhr.send(null);
    return xhr.responseText;
}

// Base64-encode the string `binary`.
// encodeURIComponent() percent-encodes the UTF-8 bytes of the string and unescape() turns every %XX
// back into a single character, so btoa() is given the UTF-8 encoding of `binary` rather than one
// byte per original character.
function base64encode(binary) {
    return btoa(unescape(encodeURIComponent(binary)));
}

// Fetch the sample file, then encode the text that came back.
var binary = getBinary('http://some.tld/sample.pdf');
var base64encoded = base64encode(binary);

As a side note, everything above is standard Javascript stuff, including btoa() and encodeURIComponent(): https://developer.mozilla.org/en/DOM/window.btoa

This works pretty smoothly, and I can even decode the base64 contents using Javascript:

// Reverse of base64encode(): atob() yields one character per decoded byte, escape() percent-encodes
// those characters, and decodeURIComponent() reads the escapes back as UTF-8 text.
function base64decode(base64) {
    return decodeURIComponent(escape(atob(base64)));
}

// Round-trip check: decoding the encoded value gives back the same string.
var decodedBinary = base64decode(base64encoded);
decodedBinary === binary // true

Now I want to decode the base64-encoded contents using Python, which consumes a JSON string to get the base64-encoded value. Naively, this is what I do:

import urllib
import base64

# ... retrieving of base64 encoded string through JSON
# NOTE(review): this assignment rebinds the name `base64` from the imported module to a str, so the
# `base64.b64decode` lookup below is made on the string -- presumably the real script used another
# variable name; confirm.
# NOTE(review): the leading "77+9" decodes to the bytes EF BF BD, which is the UTF-8 form of U+FFFD.
base64 = "77+9UE5HDQ……………oaCgA="
# Base64-decode the payload, then replace any %XX escapes found in the decoded data.
source_contents = urllib.unquote(base64.b64decode(base64))
# Write the result in binary mode; `destination` is not defined in this excerpt.
destination_file = open(destination, 'wb')
destination_file.write(source_contents)
destination_file.close()

But the resulting file is invalid; it looks like the operation messed something up with UTF-8 or some other encoding, in a way that is still unclear to me.

If I try to decode UTF-8 contents before putting them in the destination file, an error is raised:

import urllib
import base64

# ... retrieving of base64 encoded string through JSON
# NOTE(review): this assignment rebinds the name `base64` from the imported module to a str, so the
# `base64.b64decode` lookup below is made on the string -- presumably the real script used another
# variable name; confirm.
base64 = "77+9UE5HDQ……………oaCgA="
# Same pipeline as the first attempt, with the result additionally decoded as UTF-8.
# NOTE(review): the transcript below reports an 'ascii' encode failure on u'\ufffd' -- presumably
# raised when the decoded text is written to the binary-mode file; confirm.
source_contents = urllib.unquote(base64.b64decode(base64)).decode('utf-8')
# Write the result in binary mode; `destination` is not defined in this excerpt.
destination_file = open(destination, 'wb')
destination_file.write(source_contents)
destination_file.close()

$ python test.py
// ...
UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 0: ordinal not in range(128)

As a side note, here's a screenshot of two textual representations of the same file; on the left: the original; on the right: the one created from the base64-decoded string: http://cl.ly/0U3G34110z3c132O2e2x

Is there a known trick to circumvent these encoding problems when attempting to recreate the file? How would you achieve this yourself?

Any help or hint much appreciated :)

like image 945
NiKo Avatar asked Sep 10 '11 09:09

NiKo


People also ask

How do you decode Base64 encoding in Python?

To convert a string into a Base64 character the following steps should be followed: Get the ASCII value of each character in the string. Compute the 8-bit binary equivalent of the ASCII values. Convert the 8-bit characters chunk into chunks of 6 bits by re-grouping the digits.

How do I decode Base64 content?

Decoding Files To decode a file with contents that are base64 encoded, you simply provide the path of the file with the --decode flag. As with encoding files, the output will be a very long string of the original file. You may want to output stdout directly to a file.

What is Base64 decode and encode?

Base64 is an encoding and decoding technique used to convert binary data to an American Standard Code for Information Interchange (ASCII) text format, and vice versa.


1 Answer

So I'm answering my own question — sorry for that — but I think it might be useful for someone as lost as I was ;)

So you have to use ArrayBuffer and set the responseType property of your XMLHttpRequest object instance to arraybuffer for retrieving a native array of Bytes, which can be converted to base64 using the following convenient function (found there, author may be blessed here):

/**
 * Encode the bytes of an ArrayBuffer as a base64 string.
 *
 * @param {ArrayBuffer} arrayBuffer - Data to encode; it is wrapped in a Uint8Array and read byte by byte.
 * @returns {string} Base64 text in the standard `A-Z a-z 0-9 + /` alphabet, padded with `=`
 *   when the byte count is not a multiple of three.
 */
function base64ArrayBuffer(arrayBuffer) {
  var ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
  var SIX_BITS = 63; // 2^6 - 1, selects one 6-bit group

  var view = new Uint8Array(arrayBuffer);
  var total = view.byteLength;
  var tail = total % 3;        // bytes left over after the last full triplet
  var fullEnd = total - tail;  // index just past the last full triplet
  var pieces = [];
  var packed;
  var pos;

  // Every full triplet of bytes becomes exactly four base64 characters.
  for (pos = 0; pos < fullEnd; pos += 3) {
    packed = (view[pos] << 16) | (view[pos + 1] << 8) | view[pos + 2];
    pieces.push(
      ALPHABET[(packed >> 18) & SIX_BITS] +
      ALPHABET[(packed >> 12) & SIX_BITS] +
      ALPHABET[(packed >> 6) & SIX_BITS] +
      ALPHABET[packed & SIX_BITS]
    );
  }

  if (tail === 1) {
    // One byte left: 6 bits, then the remaining 2 bits shifted up with 4 zero bits below them.
    packed = view[fullEnd];
    pieces.push(ALPHABET[packed >> 2] + ALPHABET[(packed & 3) << 4] + '==');
  } else if (tail === 2) {
    // Two bytes left: 6 + 6 bits, then the remaining 4 bits shifted up with 2 zero bits below them.
    packed = (view[fullEnd] << 8) | view[fullEnd + 1];
    pieces.push(
      ALPHABET[packed >> 10] +
      ALPHABET[(packed >> 4) & SIX_BITS] +
      ALPHABET[(packed & 15) << 2] +
      '='
    );
  }

  return pieces.join('');
}

So here's some working code:

// Download the image as raw bytes and log its base64 representation.
//
// The request is opened asynchronously on purpose: the XMLHttpRequest specification makes the
// responseType setter throw InvalidAccessError for a synchronous request in a window context,
// so the 'arraybuffer' response type can only be combined with an asynchronous request there.
var xhr = new XMLHttpRequest();
xhr.open('GET', 'http://some.tld/favicon.png', true);
xhr.responseType = 'arraybuffer';
xhr.onload = function () {
    // onload also fires for HTTP error responses, so only encode successful ones.
    if (xhr.status < 200 || xhr.status >= 300) {
        console.error('Download failed with HTTP status ' + xhr.status);
        return;
    }
    // With responseType 'arraybuffer', xhr.response is an ArrayBuffer holding the body bytes.
    console.log(base64ArrayBuffer(xhr.response));
};
xhr.onerror = function () {
    console.error('Network error while downloading the file');
};
xhr.send();

This will log a valid base64 encoded string representing the binary file contents.

Edit: For older browsers not having access to ArrayBuffer and having btoa() failing on encoding characters, here's another way to get a base64 encoded version of any binary:

/**
 * Download `file` with a synchronous XMLHttpRequest and return the response body as a string.
 * The MIME type is overridden so the body is read as text in the x-user-defined charset.
 *
 * @param {string} file - URL of the resource to fetch.
 * @returns {string} The request's responseText.
 */
function getBinary(file) {
    var request = new XMLHttpRequest();
    request.open("GET", file, false);
    request.overrideMimeType("text/plain; charset=x-user-defined");
    request.send(null);
    return request.responseText;
}

/**
 * Base64-encode a string in which each character stands for one byte.
 * Only the low 8 bits of each character code contribute to the output.
 *
 * @param {string} str - Text whose character codes carry the bytes to encode.
 * @returns {string} Base64 text, padded with `=` to a multiple of four characters.
 */
function base64Encode(str) {
    var CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    var total = str.length;
    var pieces = [];
    var pos, remaining, first, second, third;

    // Walk the input three characters at a time; a short final group is padded and ends the loop.
    for (pos = 0; pos < total; pos += 3) {
        remaining = total - pos;

        first = str.charCodeAt(pos) & 0xff;
        pieces.push(CHARS.charAt(first >> 2));

        if (remaining === 1) {
            pieces.push(CHARS.charAt((first & 0x3) << 4), "==");
            break;
        }

        // The masks below keep only low-order bits, so any high bits in the code are ignored.
        second = str.charCodeAt(pos + 1);
        pieces.push(CHARS.charAt(((first & 0x3) << 4) | ((second & 0xF0) >> 4)));

        if (remaining === 2) {
            pieces.push(CHARS.charAt((second & 0xF) << 2), "=");
            break;
        }

        third = str.charCodeAt(pos + 2);
        pieces.push(
            CHARS.charAt(((second & 0xF) << 2) | ((third & 0xC0) >> 6)),
            CHARS.charAt(third & 0x3F)
        );
    }

    return pieces.join("");
}

console.log(base64Encode(getBinary('http://www.google.fr/images/srpr/logo3w.png')));

Hope this helps others as much as it helped me.

like image 146
NiKo Avatar answered Sep 26 '22 06:09

NiKo