Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Memory leak in Node.js scraper

This is a simple scraper written in JavaScript with Node.js, for scraping Wikipedia for periodic table element data. The dependencies are jsdom for DOM manipulation and chain-gang for queuing.

It works fine, most of the time (it doesn't handle errors gracefully), and the code isn't too bad, dare I say for a for attempt, but there is a serious fault with it - it leaks memory horribly, anywhere from 0.3% to 0.6% of the computer's memory for each element, such that by the time it gets to lead it would be using somewhere close to 20%, which is plainly unacceptable.

I've tried working with profilers, but I have either not found them to be helpful or have difficulty interpreting the data. I suspect it has something to do with the way processElement gets passed around, but I have difficulty in rewriting the queue code into something more elegant.

var fs = require('fs'),
    path = require('path'),
    jsdom = require("jsdom"),
    parseUrl = require('url').parse,
    chainGang = require('chain-gang');

var chain = chainGang.create({
    workers: 1
});

var Settings = {
    periodicUrl: 'http://en.wikipedia.org/wiki/Template:Periodic_table',
    periodicSelector: '#bodyContent > table:first',
    pathPrefix: 'data/',
    ignoredProperties: ['Pronunciation']
};

function writeToFile(output) {
    var keys = 0;

    // Huge nests for finding the name of the element... yeah
    for(var i in output) {
        if(typeof output[i] === 'object' && output[i] !== null){
            for(var l in output[i]) {
                if(l.toLowerCase() === 'name') {
                    var name = output[i][l];
                }
            }

            keys += Object.keys(output[i]).length;
        }
    }

    console.log('Scraped ' + keys + ' properties for ' + name);
    console.log('Writing to ' + Settings.pathPrefix + name + '.json');
    fs.writeFile(Settings.pathPrefix + name + '.json', JSON.stringify(output));
}

// Generic create task function to create a task function that
// would be passed to the chain gang
function createTask (url, callback) {
    console.log('Task added - ' + url);

    return function(worker){
        console.log('Requesting: ' +url);

        jsdom.env(url, [
            'jquery.min.js' // Local copy of jQuery
        ], function(errors, window) {
            if(errors){
                console.log('Error! ' + errors)
                createTask(url, callback);
            } else {
                // Give me thy $
                var $ = window.$;

                // Cleanup - remove unneeded elements
                $.fn.cleanup = function() {
                    return this.each(function(){
                        $(this).find('sup.reference, .IPA').remove().end()
                            .find('a, b, i, small, span').replaceWith(function(){
                                return this.innerHTML;
                            }).end()
                            .find('br').replaceWith(' ');
                    });
                }

                callback($);
            }

            worker.finish();
        });
    }
}

function processElement ($){
    var infoBox = $('.infobox'),
        image = infoBox.find('tr:contains("Appearance") + tr img:first'),
        description = $('#toc').prevAll('p').cleanup(),
        headers = infoBox.find('tr:contains("properties")'),
        output = {
            Appearance: image.attr('src'),
            Description: $('.infobox + p').cleanup().html()
        };

    headers.each(function(){
        var that = this,
            title = this.textContent.trim(),
            rowspan = 0,
            rowspanHeading = '';

        output[title] = {};

        $(this).nextUntil('tr:has(th:only-child)').each(function(){
            var t = $(this).cleanup(),
                headingEle = t.children('th'),
                data = t.children('td').html().trim();

            if(headingEle.length) {
                var heading = headingEle.html().trim();
            }

            // Skip to next heading if current property is ignored
            if(~Settings.ignoredProperties.indexOf(heading)) {
                return true;
            }

            if (rowspan) {
                output[title][rowspanHeading][data.split(':')[0].trim()] = data.split(':')[1].trim();
                rowspan--;
            } else if (headingEle.attr('rowspan')){
                rowspan = headingEle.attr('rowspan') - 1;
                rowspanHeading = heading;

                output[title][heading] = {};
                output[title][heading][data.split(':')[0]] = data.split(':')[1];
            } else if (~heading.indexOf(',')){
                data = data.split(',');

                heading.split(',').forEach(function(v, i){
                    output[title][v.trim()] = data[i].trim();
                });
            } else {
                output[title][heading] = data;
            }
        });
    });

    writeToFile(output);
}

function fetchElements(elements) {
    elements.forEach(function(value){
        // Element URL used here as task id (second argument)
        chain.add(createTask(value, processElement), value);
    });
}

function processTable($){
    var elementArray = $(Settings.periodicSelector).find('td').map(function(){
        var t = $(this),
            atomicN = parseInt(t.text(), 10);

        if(atomicN && t.children('a').length) {
            var elementUrl = 'http://' + parseUrl(Settings.periodicUrl).host + t.children('a:first').attr('href');

            console.log(atomicN, t.children('a:first').attr('href').split('/').pop(), elementUrl);
            return elementUrl;
        }
    }).get();

    fetchElements(elementArray);
    fs.writeFile(Settings.pathPrefix + 'elements.json', JSON.stringify(elementArray));
}

// Get table - init
function getPeriodicList(){
    var elementsList = Settings.pathPrefix + 'elements.json';

    if(path.existsSync(elementsList)){
        var fileData = JSON.parse(fs.readFileSync(elementsList, 'utf8'));
        fetchElements(fileData);
    } else {
        chain.add(createTask(Settings.periodicUrl, processTable));
    }
}

getPeriodicList();
like image 924
Yi Jiang Avatar asked Apr 19 '11 14:04

Yi Jiang


People also ask

How do I resolve a memory leak issue in Node JS?

A quick way to fix Node. js memory leaks in the short term is to restart the app. Make sure to do this first and then dedicate the time to seek out the root cause of the memory leak.

How do I see memory leaks in Node JS application?

Chrome DevTools is a great tool that can be used to diagnose memory leaks in Node. js applications via remote debugging. Other tools exist and they will give you the similar. This blog post relies on one of those different tools in order to give you a clear clear understanding of what is happening.

Is Node JS good for scraping?

Web scraping is the process of extracting data from a website in an automated way and Node. js can be used for web scraping. Even though other languages and frameworks are more popular for web scraping, Node. js can be utilized well to do the job too.


1 Answers

jsdom does have a memory leak which stems from the copy in and copy out logic behind node's vm.runInContext(). There has been effort to fix this problem using c++ and we are hoping to prove out the solution before attempting to push it into node.

A workaround for now is to spawn up a child process for each dom and close it down when you are done.

EDIT:

as of jsdom 0.2.3 this issue is fixed as long as you close the window (window.close()) when you are done with it.

like image 87
tmpvar Avatar answered Oct 02 '22 09:10

tmpvar