Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Javascript - .map running out of memory

My libraries:

const Promise = require('bluebird');
const fs = Promise.promisifyAll(require('graceful-fs'));
const path = require('path');
const xml2js = Promise.promisifyAll(require('xml2js'));

I have a large number of XML files I want to parse. I am able to create an array of paths to all the files using this function:

/**
 * Collects the paths of every `.XML` file found in the immediate
 * sub-directories of `rootPath` (one level deep, extension match is
 * case-sensitive).
 *
 * @param {string} rootPath - Directory whose sub-directories are scanned.
 * @returns {Promise<string[]>} Bluebird promise for a flat array of file
 *   paths; resolves with an empty array when nothing is found.
 */
function getFileNames(rootPath) {
  // Read content of path
  return fs.readdirAsync(rootPath)
    // Keep only the directories. stat is done through the promisified API so
    // the event loop is not blocked once per directory entry.
    .filter(function(entry) {
      return fs.statAsync(path.join(rootPath, entry)).then(function(stats) {
        return stats.isDirectory();
      });
    })
    // For every directory
    .map(function(directory) {
      // Save current path
      const currentPath = path.join(rootPath, directory);
      // Read files in the directory
      return fs.readdirAsync(currentPath)
        // Filter out the XMLs
        .filter(function(file) {
          return path.extname(file) === '.XML';
        })
        // Return path to file
        .map(function(file) {
          return path.join(currentPath, file);
        });
    })
    // Flatten array of results. The explicit [] seed makes the promise
    // resolve with an empty array (not undefined) when there are no
    // sub-directories.
    .reduce(function(a, b) {
      return a.concat(b);
    }, []);
}

and now I want to go through every single file and parse it.

I have 2 functions to do so:

/**
 * Reads the whole file at `filePath` (resolved relative to the current
 * working directory).
 *
 * @param {string} filePath - Relative path of the file to read.
 * @returns {Promise<Buffer>} Promise for the raw file contents; rejects if
 *   the file cannot be read.
 */
function openFile(filePath) {
  // readFileAsync already resolves with the file data, so no extra
  // pass-through .then() is needed.
  return fs.readFileAsync('./' + filePath);
}

/**
 * Parses XML content into a JavaScript object using xml2js.
 *
 * @param {string|Buffer} data - XML document to parse.
 * @returns {Promise<Object>} Promise for the parsed object; rejects if the
 *   parser reports an error.
 */
function parseFile(data) {
  // parseStringAsync already resolves with the parsed object, so no extra
  // pass-through .then() is needed.
  return xml2js.parseStringAsync(data);
}

Now, when I call these functions inside .map (getFileNames resolves with an array of over 20k file path strings):

// List every XML path, then start reading and parsing each file and log the
// parsed object.
getFileNames('./XML')
  .map(function(file) {
    // NOTE(review): the promise chain below is not returned from this
    // callback, so .map receives undefined for every file and has nothing to
    // wait on; the reads for all paths are started without waiting for any
    // earlier one to finish.
    openFile(file)
      .then(function(data) {
        parseFile(data)
            .then(function(object) {
              console.log(object);
            });
      });
  });

I get a javascript heap out of memory error:

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory

But when I run the function a single time by passing in the path to the actual file:

// Read and parse one specific file, then log the parsed object.
openFile('./XML/2016-10-1/EUROTIPOLD2016-10-1T00-00-22.5756240530.XML')
  .then(function(data) {
    parseFile(data)
        .then(function(object) {
          console.log(object);
        });
  });

I get the desired output.

What am I doing wrong?

like image 488
Miha Šušteršič Avatar asked Jun 01 '26 08:06

Miha Šušteršič


1 Answers

Iterating over thousands of files happens asynchronously.

1) You're getting list of files

2) By calling .map, you invoke openFile and parseFile for every file; both are async functions, and reading and parsing take time.


Because the work is asynchronous, the loop proceeds to the next file without waiting for the previous one to finish, so the garbage collector never gets a chance to reclaim memory in between, and the process runs out of memory.

Think about reading 20K files with different sizes at once.


So here is solution:

Use async to synchronize (eachSeries) or control (eachLimit) iteration.

const async = require('async'); // install: npm i --save async

// getFileNames() returns a promise, so wait for the actual array of paths
// before handing it to async.
getFileNames('./XML')
  .then((files) => {
    // Guard against an empty/undefined result so async always gets a list.
    const paths = files || [];

    // Processes one file at a time. For bounded parallelism use
    // async.eachLimit(paths, 3, iteratee, done) instead.
    async.eachSeries(
      paths,
      (file, next) => {
        openFile(file)
          .then(
            (data) =>
              parseFile(data).then(
                (object) => { // successfully parsed file, so log it out
                  console.log(object);
                },
                (err) => {
                  console.error('Cannot parse data from file:', file, err);
                }
              ),
            (err) => {
              console.error('Cannot open file:', file, err);
            }
          )
          // Single exit point: next() is called exactly once per file, no
          // matter which step failed, and only then does the next file start.
          .then(() => next(), next);
      },
      (err) => {
        // Only reached with an error if something unexpected was thrown;
        // open/parse failures are logged above and skipped.
        if (err) {
          console.error('Processing stopped:', err);
        }
      }
    );
  })
  .catch((err) => {
    console.error('Cannot list files:', err);
  });

p.s. feel free to comment and fix code issue in my answer.

like image 87
num8er Avatar answered Jun 02 '26 22:06

num8er



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!