 

Checksum of large files in nodejs

I am in the process of writing a script to make a database of my pictures. I have a script that sort of works. It goes through a folder with 5,670 files totalling 13.08 GB of data in 9 minutes and 24 seconds. When I try it on newer, larger photos, performance seems to degrade drastically: within 20 minutes it has only calculated the hashes of three small preview files in a folder with 431 files totalling 7.58 GB.

What am I doing wrong?

var fs = require('fs')
var crypto = require('crypto')
var util = require('util')
var p = require('path')
var sqlite3 = require('sqlite3').verbose()
var db = new sqlite3.Database('./sqlite.db')
const hash_algorithm = 'sha256'

var fileCount = 0

function getFiles(directory) {
    fs.readdir(directory, function(err, files) {
        for (var i in files) {
            var filepath = directory + '/' + files[i]
            fileStat(filepath)
        }
    })
}

function fileStat(filepath) {
    fs.stat(filepath, function(err, stats) {
        if (stats.isDirectory()) {
            getFiles(filepath)
        } else {
            computeHash(filepath, hash_algorithm, function(err, hash) {
                if (err) {
                    throw err
                }
                insertStat(filepath, hash, stats.size)
            })
        }
    })
}

function computeHash(filepath, algorithm, callback) {
    var hash = crypto.createHash(algorithm)
    var rs = fs.createReadStream(filepath)

    rs.on('open', function() {})

    rs.on('error', function(err) {
        throw err
    })

    rs.on('data', function(chunk) {
        hash.update(chunk)
    })

    rs.on('end', function() {
        hash = hash.digest('hex')
        return callback(null, hash)
    })
}

function getExif(filepath, callback) {

}

function insertStat(filepath, hash, size) {
    var sql = "INSERT INTO files VALUES ($filename, $path, $hash, $size)"
    var filename = filepath.split('/')
    filename = filename[filename.length - 1]
    db.run(sql, {$filename: filename, $path: filepath, $hash: hash, $size: size})
    if (verbose) console.log('%s: %s', ++fileCount, filepath)
}

db.serialize(function() {
    db.run('CREATE TABLE files (filename text, path text, hash text, size integer)')
})

var verbose = true
var path = process.argv[2] || '.'
path = p.resolve(path)

if (verbose) console.log('path: %s', path)
getFiles(path)
rlp asked Feb 06 '18


1 Answer

Your whole process is asynchronous. While that is good practice in JavaScript, you should keep your memory consumption under control:

  1. You start opening your files asynchronously with fs.stat, which means ALL of your files at once.

  2. Then you load them into memory using buffers, but you can't finish processing them until they're completely loaded and hit on('end', ...), which means ALL your files are competing to be FULLY loaded into your RAM.

Got it? Your memory usage sits at 100%, and you have to hope that one file gets fully loaded and processed so that some memory is released for another one. That's what you're doing wrong.

So you need to get your memory usage back under control. Ideally, you would limit how many files are processed at once; as a quick fix, I suggest you make the directory walk synchronous with fs.statSync.
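A minimal sketch of what limiting the work to one file at a time could look like, reusing computeHash(), insertStat(), hash_algorithm and path from the question; the collectFiles() and processNext() helpers are my own naming, not part of the original code. The directory walk is synchronous, and the next file is only read once the previous one has been hashed:

function collectFiles(directory, list) {
    fs.readdirSync(directory).forEach(function(entry) {
        var filepath = directory + '/' + entry;
        var stats = fs.statSync(filepath);
        if (stats.isDirectory()) {
            collectFiles(filepath, list);                // recurse into subfolders
        } else {
            list.push({path: filepath, size: stats.size});
        }
    });
    return list;
}

function processNext(queue) {
    if (queue.length === 0) return;                      // all files done
    var file = queue.shift();
    computeHash(file.path, hash_algorithm, function(err, hash) {
        if (err) throw err;
        insertStat(file.path, hash, file.size);
        processNext(queue);                              // only now start the next file
    });
}

processNext(collectFiles(path, []));

With this, only one read stream is open at any time, so memory usage stays flat no matter how many or how large the photos are.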


Side notes

Your process also involves a database. That's the usual suspect for performance problems. Your code should log any db error. Here I see no potential deadlock or full scan, so no worries there. Just make sure your files table is created before you start inserting.
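For example, a small sketch of how that could look, using the same schema as the question; the error logging and IF NOT EXISTS are my additions, not something the original code had:

db.serialize(function() {
    // idempotent, so an insert can never hit a missing table
    db.run('CREATE TABLE IF NOT EXISTS files (filename text, path text, hash text, size integer)');
});

function insertStat(filepath, hash, size) {
    var sql = 'INSERT INTO files VALUES ($filename, $path, $hash, $size)';
    db.run(sql, {$filename: p.basename(filepath), $path: filepath, $hash: hash, $size: size}, function(err) {
        if (err) console.error('insert failed for %s: %s', filepath, err.message);
    });
}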

Never use for..in to loop over an array. Use array.forEach() instead.
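Applied to the question's getFiles(), that would look roughly like this:

fs.readdir(directory, function(err, files) {
    if (err) throw err;
    files.forEach(function(file) {
        fileStat(directory + '/' + file);   // one callback per entry, no for..in
    });
});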

Please use semicolons (;) in your code. Yes, JavaScript can usually do without them, but they will save you from weird bugs and ease the interpreter's job.

RaphaMex answered Sep 19 '22