
Async parallel requests are running sequentially

I am running a server using Node.js and need to request data from another server that I am running (localhost:3001). I need to make many requests (~200) to the data server and collect the data (response sizes vary from ~20 KB to ~20 MB). Each request is independent, and I would like to save the responses as one giant array of the form:

[{"urlAAA": responseAAA}, {"urlCCC": responseCCC}, {"urlBBB": responseBBB}, etc ]

Notice that the order of the items is unimportant; ideally they should fill the array in the order that the data becomes available.

var express = require('express');
var router = express.Router();
var async = require("async");
var papa = require("papaparse");
var sync_request = require('sync-request');
var request = require("request");

var pinnacle_data = {};
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    lookup_list.push(i);
}

function write_delayed_files(object, key, value) {
    object[key] = value;
    return;
}

var show_file = function (file_number) {
    var file_index = Math.round(Math.random() * 495) + 1;
    var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index.toString();
    // NOTE: sync-request blocks the event loop until the response arrives
    var response_json = sync_request('GET', pinnacle_file_index);
    var pinnacle_json = JSON.parse(response_json.getBody('utf8'));
    var object_key = "file_" + file_number.toString();
    pinnacle_data[object_key] = pinnacle_json;
    console.log("We've handled file:    " + file_number);
    return;
};

async.each(lookup_list, show_file, function (err) {});



console.log(pinnacle_data);

/* GET contact us page. */
router.get('/', function (req, res, next) {
    res.render('predictionsWtaLinks', {title: 'Async Trial'});
});

module.exports = router;

Now when this program is run it displays:

We've handled file:    0
We've handled file:    1
We've handled file:    2
We've handled file:    3
We've handled file:    4
We've handled file:    5
etc

Now, as the files are of such variable size, I was expecting this to perform the requests "in parallel", but it performs them sequentially, which is exactly what I was trying to avoid by using async.each(). Connecting to the data server takes about 1-2 s per request, so performing this over many files takes far too long.

I realise I am making synchronous requests, so ideally I would like to replace:

var response_json = sync_request('GET', pinnacle_file_index);

with something similar to

request(pinnacle_file_index, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        pinnacle_data[object_key] = JSON.parse(body);
    }
});

Any help would be much appreciated.

Additionally I have looked at trying:

  • Converting the list of urls into a list of anonymous functions and using async.parallel(function_list, function (err, results) { //add results to pinnacle_data[]});. (I have encountered problems trying to define unique functions for each element in the array; a rough sketch of what I was attempting is shown below.)
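
For reference, the pattern I was attempting looks roughly like the untested sketch below, where each URL is bound into its own task function via a closure (using the loop index directly as the query parameter, just for illustration):

var function_list = lookup_list.map(function (file_number) {
    // the closure captures file_number, giving each task its own URL
    return function (callback) {
        var url = 'http://localhost:3001/generate?file=' + file_number;
        request(url, function (error, response, body) {
            if (error || response.statusCode != 200) {
                return callback(error || response.statusCode);
            }
            callback(null, JSON.parse(body));
        });
    };
});

async.parallel(function_list, function (err, results) {
    // results is an array of parsed bodies, one per task
});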

Similarly I have looked at other related topics:

  • I have tried to mimic suggested solutions from Asynchronous http calls with nodeJS with no progress.

  • Node.js - Async.js: how does parallel execution work?.

  • How to do parallel async multiple requests at once with Promises in Node

EDIT - WORKING SOLUTION


The following code now does the task, taking ~80 ms per request (including repeated attempts made with npm requestretry). It also scales very well: the average request time stays around ~80 ms whether making 5 requests in total or 1000.

var performance = require("performance-now");
var time_start = performance();
var async = require("async");
var request_retry = require('requestretry');

var lookup_list = [];
var total_requests = 50;
for (var i = 0; i < total_requests; i++) {
    lookup_list.push(i);
}

var pinnacle_data = {};
async.map(lookup_list, function (item, callback) {
        var file_index = Math.round(Math.random() * 495) + 1;
        var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index;
        request_retry({
                url: pinnacle_file_index,
                maxAttempts: 20,
                retryDelay: 20,
                retryStrategy: request_retry.RetryStrategies.HTTPOrNetworkError
            },
            function (error, response, body) {
                if (!error && response.statusCode == 200) {
                    body = JSON.parse(body);
                    var data_array = {};
                    data_array[file_index.toString()] = body;
                    callback(null, data_array);
                } else {
                    console.log(error);
                    callback(error || response.statusCode);
                }
            });
    },
    function (err, results) {
        var time_finish = performance();
        console.log("It took " + (time_finish - time_start).toFixed(3) + "ms to complete " + total_requests + " requests.");
        console.log("This gives an average rate of " + ((time_finish - time_start) / total_requests).toFixed(3) + " ms/request");
        if (!err) {
            for (var i = 0; i < results.length; i++) {
                for (var key in results[i]) {
                    pinnacle_data[key] = results[i][key];
                }
            }
            var length_array = Object.keys(pinnacle_data).length.toString();
            console.log("We've got all the data, totalling " + length_array + " unique entries.");
        } else {
            console.log("We had an error somewhere.");
        }
    });
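
One further note: firing every request at once has worked fine here, but for much larger batches it may be worth capping concurrency. async.mapLimit() is a drop-in variant of async.map() that takes a limit; an untested sketch of the same logic, assuming a cap of 20 in-flight requests:

var async = require("async");
var request_retry = require('requestretry');

// at most 20 requests in flight at once; otherwise the same shape as async.map above
async.mapLimit(lookup_list, 20, function (item, callback) {
    var file_index = Math.round(Math.random() * 495) + 1;
    request_retry({
        url: 'http://localhost:3001/generate?file=' + file_index,
        maxAttempts: 20,
        retryDelay: 20
    }, function (error, response, body) {
        if (error || response.statusCode != 200) {
            return callback(error || response.statusCode);
        }
        var entry = {};
        entry[file_index.toString()] = JSON.parse(body);
        callback(null, entry);
    });
}, function (err, results) {
    // fires once every item has been processed, exactly as with async.map
});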

Thanks for the help.

oliversm asked Jan 07 '23

2 Answers

As you have discovered, async.parallel() can only parallelize operations that are themselves asynchronous. If the operations are synchronous, then because of the single-threaded nature of node.js, the operations will run one after another, not in parallel. But if the operations are themselves asynchronous, then async.parallel() (or other async methods) will start them all at once and coordinate the results for you.
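
To make that distinction concrete, here is a small standalone sketch (not taken from the question's code): three 1-second timers started through async.parallel() finish in about one second total, because each task returns immediately and the waiting overlaps.

var async = require("async");

// three asynchronous tasks, each taking ~1 second
var tasks = [1, 2, 3].map(function (n) {
    return function (callback) {
        setTimeout(function () {
            callback(null, n);
        }, 1000);
    };
});

var start = Date.now();
async.parallel(tasks, function (err, results) {
    // prints roughly 1000ms, not 3000ms: the timers ran concurrently
    console.log(results, (Date.now() - start) + "ms");
});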

Here's a general idea using async.map(). I used async.map() because the idea there is that it takes an array as input and produces an array of results in the same order as the original, but runs all the requests in parallel which seems to line up with what you want:

var async = require("async");
var request = require("request");

// create list of URLs
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    var index = Math.round(Math.random() * 495) + 1;
    var url = 'http://localhost:3001/generate?file=' + index;
    lookup_list.push(url);
}

async.map(lookup_list, function(url, callback) {
    // iterator function
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            body = JSON.parse(body);
            // do any further processing of the data here
            callback(null, body);
        } else {
            callback(error || response.statusCode);
        }
    });
}, function(err, results) {
    // completion function
    if (!err) {
        // process all results in the array here
        console.log(results);
        for (var i = 0; i < results.length; i++) {
            // do something with results[i]
        }
    } else {
        // handle error here
    }
});
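
If you want the [{url: body}, ...] shape from your question, one option (a sketch, not part of the code above) is to key each result by its URL inside the iterator function:

async.map(lookup_list, function (url, callback) {
    request(url, function (error, response, body) {
        if (error || response.statusCode != 200) {
            return callback(error || response.statusCode);
        }
        // wrap the parsed body in a one-key object, keyed by its URL
        var entry = {};
        entry[url] = JSON.parse(body);
        callback(null, entry);
    });
}, function (err, results) {
    // results is e.g. [{"http://localhost:3001/generate?file=12": {...}}, ...]
});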

And here's a version using Bluebird promises, similarly using Promise.map() to iterate the initial array:

var Promise = require("bluebird");
var request = Promise.promisifyAll(require("request"), {multiArgs: true});

// create list of URLs
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    var index = Math.round(Math.random() * 495) + 1;
    var url = 'http://localhost:3001/generate?file=' + index;
    lookup_list.push(url);
}

Promise.map(lookup_list, function(url) {
    return request.getAsync(url).spread(function(response, body) {
        if (response.statusCode !== 200) {
            throw response.statusCode;
        }
        return JSON.parse(body);
    });
}).then(function(results) {
    console.log(results);
    for (var i = 0; i < results.length; i++) {
        // process results[i] here
    }
}, function(err) {
    // process error here
});
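
Promise.map() also accepts a concurrency option if you want to cap how many requests are in flight at once; a sketch with an assumed limit of 10:

Promise.map(lookup_list, function (url) {
    return request.getAsync(url).spread(function (response, body) {
        if (response.statusCode !== 200) {
            throw response.statusCode;
        }
        return JSON.parse(body);
    });
}, { concurrency: 10 }).then(function (results) {
    // at most 10 requests were ever outstanding at the same time
    console.log(results.length);
});
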
jfriend00 answered Jan 18 '23


Sounds like you're just trying to download a bunch of URLs in parallel. This will do that:

var request = require('request');
var async = require('async');

var urls = ['http://microsoft.com', 'http://yahoo.com', 'http://google.com', 'http://amazon.com'];

var loaders = urls.map(function (url) {
    return function (callback) {
        request(url, callback);
    };
});

async.parallel(loaders, function (err, results) {
    if (err) throw (err); // ... handle appropriately
    // results will be an array of the results, in
    // the same order as 'urls', even though the operation
    // was done in parallel
    console.log(results.length); // == urls.length
});

or even simpler, using async.map:

var request = require('request');
var async = require('async');

var urls = ['http://microsoft.com', 'http://yahoo.com', 'http://google.com', 'http://amazon.com'];

async.map(urls, request, function (err, results) {
    if (err) throw (err);         // handle error
    console.log(results.length);  // == urls.length
});
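
One caveat with this shorter form: request invokes its callback with (error, response, body), and async.map keeps only the second argument, so each entry in results is a full response object rather than just the body. A quick sketch of pulling the payloads out:

async.map(urls, request, function (err, responses) {
    if (err) throw (err);
    // each entry is a response object; request puts the payload on .body
    var bodies = responses.map(function (response) {
        return response.body;
    });
    console.log(bodies.length); // == urls.length
});
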
caasjj answered Jan 18 '23