So I'm making a little scraper for learning purposes; in the end I should get a tree-like structure of the pages on the website.
I've been banging my head trying to get the requests right. This is more or less what I have:
var request = require('request');
/**
 * Sketch of a recursive crawler: requests the page at `url` and tries to build
 * an object keyed by each link found on that page, recursing into every link.
 *
 * NOTE(review): the author states this code does not work as written; the
 * inline notes below mark the problems visible in the snippet itself.
 */
function scanPage(url) {
// request the page at given url:
// NOTE(review): `err` is never inspected inside the callback.
request.get(url, function(err, res, body) {
var pageObject = {};
/* [... Jquery mumbo-jumbo to
1. Fill the page object with information and
2. Get the links on that page and store them into arrayOfLinks
*/
var arrayOfLinks = ['url1', 'url2', 'url3'];
for (var i = 0; i < arrayOfLinks.length; i++) {
// NOTE(review): `pageObj` is not declared in this snippet (the local above is
// `pageObject`), and `scanPage[...]` indexes the function object rather than
// calling it with the link.
pageObj[arrayOfLinks[i]] = scanPage[arrayOfLinks[i]];
}
});
// NOTE(review): this return sits outside the request callback, so it does not
// wait for the callback's work (presumably the request completes later);
// `pageObj` is also undeclared in this scope.
return pageObj;
}
I know this code is wrong on many levels, but it should give you an idea of what I'm trying to do.
How should I modify it to make it work? (without the use of promises if possible)
(You can assume that the website has a tree-like structure, so every page only has links to pages further down the tree, hence the recursive approach.)
I know that you'd rather not use promises for whatever reason (and I can't ask why in the comments because I'm new), but I believe that promises are the best way to achieve this.
Here's a solution using promises that answers your question, but might not be exactly what you need:
var request = require('request');
var Promise = require('bluebird');
var get = Promise.promisify(request.get);
// Passed as Bluebird's `concurrency` option to the Promise.map call in scanPage.
// NOTE(review): that option limits a single map call, and scanPage recurses, so
// values above 1 presumably do not cap connections across the whole crawl —
// confirm against the Bluebird documentation before raising it.
var maxConnections = 1; // maximum number of concurrent connections
/**
 * Fetches the page at `url`, then recursively scans every link found on it.
 *
 * @param {string} url - address of the page to scan.
 * @returns {Promise<Object>} promise for an object mapping each link on the
 *   page to the scanned result of that link's own page.
 */
function scanPage(url) {
  // request the page at given url:
  return get(url).then((response) => {
    var body = response.body;
    /* [... Jquery mumbo-jumbo to
    1. Fill the page object with information and
    2. Get the links on that page and store them into arrayOfLinks
    */
    var arrayOfLinks = ['url1', 'url2', 'url3'];

    // Recurse into every link; the results come back in the same order as
    // arrayOfLinks, which is what lets them be re-keyed by index below.
    var mapOptions = { concurrency: maxConnections };
    var pendingChildren = Promise.map(arrayOfLinks, scanPage, mapOptions);

    return pendingChildren.then((children) => {
      var tree = {};
      children.forEach((child, index) => {
        tree[arrayOfLinks[index]] = child;
      });
      return tree;
    });
  });
}
// Kick off the crawl at the root page and consume the resulting tree.
scanPage("http://example.com/")
  .then((res) => {
    // do whatever with res
  })
  .catch((err) => {
    // Without a handler here, a failed request would surface only as an
    // unhandled promise rejection.
    console.error("scanPage failed:", err);
  });
Edit: Thanks to Bergi's comment, rewrote the code to avoid the Promise constructor antipattern.
Edit: Rewrote in a much better way. By using Bluebird's concurrency
option, you can easily limit the number of simultaneous connections.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With