 

Node.js: download multiple files

I need to download ~26k images. The image names and URLs are stored in a CSV file. I'm reading the CSV file and trying to download the images while looping through the list.

With a small set (~1-2k) it works fine, but when I switch to the full set I get an EMFILE error (too many open file descriptors):

Error: EMFILE, open 'S:\images_download\Images\189900008.jpg'

I've noticed that Node tries to create all the files at once, and this might be the issue: async.each starts every task in parallel, so with ~26k entries it opens ~26k write streams and requests at the same time. I'm unable to force it to create them one by one. My understanding is that the code below should work that way, but obviously it doesn't.

(Note that this code runs on Windows.)

Code:

var csv     = require("fast-csv");
var fs      = require('fs');
var request = require('request');
var async   = require('async');

fs.writeFile('errors.txt', '', function(){})

var downloaded = 0;
var totalImages = 0;
var files = [];

csv
 .fromPath("Device_Images_List.csv")
 .on("data", function(data){
    files.push({device: data[0], url: data[1]})
 })
 .on("end", function(){     
    totalImages = files.length;

    async.each(files, function(file, callback) {
        var deviceId = file.device;
        var deviceUrl = file.url;   

        if ( deviceId != 'DEVICE_TYPE_KEY' ) {
                try {
                    writeStream = fs.createWriteStream('./Images/' + deviceId + '.jpg');
                    proxiedRequest = request.defaults({proxy: "http://proxy:8080"});
                    proxiedRequest(deviceUrl).pipe(writeStream); 
                    writeStream.on('open', function(fd) {

                        var rem = proxiedRequest.get(deviceUrl);

                        rem.on('data', function(chunk) {        
                            writeStream.write(chunk);       
                        });
                        rem.on('end', function() {
                            downloaded++;
                            console.log('Downloaded: ' + deviceId + '; ' + (downloaded + 1) + ' of ' + totalImages);
                            writeStream.end();                      
                        });

                    });

                    writeStream.on('close', function(){
                        callback();
                    });

                } catch (ex) {
                    fs.appendFile('errors.txt', deviceId + ' failed to download', function (err) {
                        callback();
                    });
                }
        }       
    }, function(err){
        if( err ) {
          console.log(err);
        } else {

        }
    }); 
});
asked Jun 02 '15 by Stefan Stoichev
1 Answer

As @slebetman commented, the issue can be solved by using async.eachSeries to process the files one at a time, or async.eachLimit to cap the number of parallel downloads:

async.eachLimit(files, 5, function(file, callback) {
    // ... download one file here, then call callback() when it finishes
}, function(err){
    // runs once after every file has been processed
});
answered Nov 15 '22 by Stefan Stoichev