Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

JavaScript File Hash Value Generate with Part of the file

I am working with JavaScript to generate File HASH VALUE for unique file values. Kindly check the below code for the Hash Generation Mechanism Which works good.

<script type="text/javascript">
// Reference: https://code.google.com/p/crypto-js/#MD5
function handleFileSelect(evt) 
{   
    var files = evt.target.files; // FileList object
    // Loop through the FileList and render image files as thumbnails.
    for (var i = 0, f; f = files[i]; i++) 
    {
        var reader = new FileReader();
        // Closure to capture the file information.
        reader.onload = (function(theFile) 
        {
            return function(e) 
            {
                var span = document.createElement('span');
                var test = e.target.result;                 
                //var hash = hex_md5(test);
                var hash = CryptoJS.MD5(test);
                var elem = document.getElementById("hashValue");
                elem.value = hash;
            };
        })(f);
        // Read in the image file as a data URL.
        reader.readAsBinaryString(f);
    }
}
document.getElementById('videoupload').addEventListener('change', handleFileSelect, false);
</script>

However I am facing problem when generating HASH VALUE for large files as in client side the browser Crashed.

Up-till 30MB the HASHING works well but if i try to upload larger than that the system crashes.

My Question is:

  1. Can I generate HASH Value for part of file than reading the LARGE files and getting crashes? If yes, Can I know how to do that width 'FileReader';

  2. Can I specify any amount of Byte such as 2000 Character of a file to generate HASH Value then generating for large files.

I hope the above two solution will work for larger and small files. Is there any other options?

My Fiddle Demo

like image 385
DonOfDen Avatar asked Jan 29 '15 10:01

DonOfDen


1 Answers

  1. Can I generate HASH Value for part of file than reading the LARGE files and getting crashes? If yes, Can I know how to do that width 'FileReader';

Yes, you can do that and it is called Progressive Hashing.

var md5 = CryptoJS.algo.MD5.create();

md5.update("file part 1");
md5.update("file part 2");
md5.update("file part 3");

var hash = md5.finalize();
  1. Can I specify any amount of Byte such as 2000 Character of a file to generate HASH Value then generating for large files.

There's an HTML5Rocks article on how one can use File.slice to pass a sliced file to the FileReader:

var blob = file.slice(startingByte, endindByte);
reader.readAsArrayBuffer(blob);

Full solution

I have combined both. The tricky part was to synchronize the file reading, because FileReader.readAsArrayBuffer() is asynchronous. I've written a small series function which is modeled after the series function of async.js. It has to be done one after the other, because there is is no way to get to the internal state of the hashing function of CryptoJS.

Additionally, CryptoJS doesn't understand what an ArrayBuffer is, so it has to be converted to its native data representation, which is the so-called WordArray:

function arrayBufferToWordArray(ab) {
  var i8a = new Uint8Array(ab);
  var a = [];
  for (var i = 0; i < i8a.length; i += 4) {
    a.push(i8a[i] << 24 | i8a[i + 1] << 16 | i8a[i + 2] << 8 | i8a[i + 3]);
  }
  return CryptoJS.lib.WordArray.create(a, i8a.length);
}

The other thing is that hashing is a synchronous operation where there is no yield to continue execution elsewhere. Because of this, the browser will freeze since JavaScript is single threaded. The solution is to use Web Workers to off-load the hashing to a different thread so that the UI thread keeps responsive.
Web workers expect the script file in their constructors, so I used this solution by Rob W to have an inline script.

function series(tasks, done){
    if(!tasks || tasks.length === 0) {
        done();
    } else {
        tasks[0](function(){
            series(tasks.slice(1), done);
        });
    }
}

function webWorkerOnMessage(e){
    if (e.data.type === "create") {
        md5 = CryptoJS.algo.MD5.create();
        postMessage({type: "create"});
    } else if (e.data.type === "update") {
        function arrayBufferToWordArray(ab) {
            var i8a = new Uint8Array(ab);
            var a = [];
            for (var i = 0; i < i8a.length; i += 4) {
                a.push(i8a[i] << 24 | i8a[i + 1] << 16 | i8a[i + 2] << 8 | i8a[i + 3]);
            }
            return CryptoJS.lib.WordArray.create(a, i8a.length);
        }
        md5.update(arrayBufferToWordArray(e.data.chunk));
        postMessage({type: "update"});
    } else if (e.data.type === "finish") {
        postMessage({type: "finish", hash: ""+md5.finalize()});
    }
}

// URL.createObjectURL
window.URL = window.URL || window.webkitURL;

// "Server response", used in all examples
var response = 
    "importScripts('https://cdn.rawgit.com/CryptoStore/crypto-js/3.1.2/build/rollups/md5.js');"+
    "var md5;"+
    "self.onmessage = "+webWorkerOnMessage.toString();

var blob;
try {
    blob = new Blob([response], {type: 'application/javascript'});
} catch (e) { // Backwards-compatibility
    window.BlobBuilder = window.BlobBuilder || window.WebKitBlobBuilder || window.MozBlobBuilder;
    blob = new BlobBuilder();
    blob.append(response);
    blob = blob.getBlob();
}
var worker = new Worker(URL.createObjectURL(blob));


var files = evt.target.files; // FileList object    
var chunksize = 1000000; // the chunk size doesn't make a difference
var i = 0, 
    f = files[i],
    chunks = Math.ceil(f.size / chunksize),
    chunkTasks = [],
    startTime = (new Date()).getTime();
worker.onmessage = function(e) {
    // create callback

    for(var j = 0; j < chunks; j++){
        (function(j, f){
            chunkTasks.push(function(next){
                var blob = f.slice(j * chunksize, Math.min((j+1) * chunksize, f.size));
                var reader = new FileReader();

                reader.onload = function(e) {
                    var chunk = e.target.result;
                    worker.onmessage = function(e) {
                        // update callback
                        document.getElementById('num').innerHTML = ""+(j+1)+"/"+chunks;
                        next();
                    };
                    worker.postMessage({type: "update", chunk: chunk});
                };
                reader.readAsArrayBuffer(blob);
            });
        })(j, f);
    }
    series(chunkTasks, function(){
        var elem = document.getElementById("hashValueSplit");
        var telem = document.getElementById("time");
        worker.onmessage = function(e) {
            // finish callback
            elem.value = e.data.hash;
            telem.innerHTML = "in " + Math.ceil(((new Date()).getTime() - startTime) / 1000) + " seconds";
        };
        worker.postMessage({type: "finish"});
    });

    // blocking way ahead...
    if (document.getElementById("singleHash").checked) {
        var reader = new FileReader();

        // Closure to capture the file information.
        reader.onloadend = (function(theFile) {
            function arrayBufferToWordArray(ab) {
                var i8a = new Uint8Array(ab);
                var a = [];
                for (var i = 0; i < i8a.length; i += 4) {
                    a.push(i8a[i] << 24 | i8a[i + 1] << 16 | i8a[i + 2] << 8 | i8a[i + 3]);
                }
                return CryptoJS.lib.WordArray.create(a, i8a.length);
            }
            return function(e) {
                var test = e.target.result;
                var hash = CryptoJS.MD5(arrayBufferToWordArray(test));
                //var hash = "none";
                var elem = document.getElementById("hashValue");
                elem.value = hash;
            };
        })(f);

        // Read in the image file as a data URL.
        reader.readAsArrayBuffer(f);
    }
};
worker.postMessage({type: "create"});

DEMO seems to work for big files, but it takes quite a lot of time. Maybe this can be improved using a faster MD5 implementation. It took around 23 minutes to hash a 3 GB file.

This answer of mine shows an example without webworkers for SHA-256.

like image 61
Artjom B. Avatar answered Oct 17 '22 16:10

Artjom B.