I'm new to PhantomJS. I want to load a page, scrape its links, then open each of those in sequence, one at a time, perhaps even with a delay between each request. I'm having trouble getting one to fire after the other, so I thought maybe I could use promises to solve this problem, but I don't think Node libraries work with Phantom. Every example I've seen so far opens a single page, and then quits.
Here's what I've got:
// Load a listing page, collect the profile links on it, then attempt to open
// and screenshot every linked page using the same page object.
var page = require('webpage').create();
page.open('http://example.com/secretpage', function(status) {
console.log(status);
if(status !== 'success') {
console.log('Unable to access network');
} else {
// Collect the href of every anchor that is a direct child of a .profile
// element and hand the resulting array back to this script.
var links = page.evaluate(function() {
var nodes = [];
var matches = document.querySelectorAll('.profile > a');
for(var i = 0; i < matches.length; ++i) {
nodes.push(matches[i].href);
}
return nodes;
});
// forEach moves on to the next link without waiting for the previous
// page.open callback, so all the open calls below are issued back to back
// on the single shared page object.
links.forEach(function(link) {
console.log(link);
page.open(link, function(status) { // <---- tries opening every page at once
console.log(status);
// Read the contents of the #username element; it is used as the
// screenshot file name below.
var name = page.evaluate(function() {
return document.getElementById('username').innerHTML;
});
console.log(name);
page.render('profiles/'+name + '.png');
});
});
}
// NOTE(review): the exit call is commented out and nothing else in this
// snippet calls phantom.exit().
// phantom.exit();
});
Is there a way I can open each link in sequence?
For this typical scenario, I use async.js and especially the queue component.
Here is a very basic implementation
// Crawl every link found on a start page, one page at a time, using the queue
// component of async.js. The same page object is reused for every URL.
phantom.injectJs('async.js');
var page = require('webpage').create();

// Queue worker: opens a single queued URL, renders it on success, then calls
// callback() to tell the queue this task is finished. The concurrency argument
// is 1, so the next URL is only handed to the worker after callback() has run.
var q = async.queue(function (task, callback) {
  page.open(task.url, function (status) {
    if (status !== 'success') {
      console.log('Unable to open url > ' + task.url);
    } else {
      console.log('opened ' + task.url);
      //do whatever you want here ...
      page.render(Date.now() + '.png');
    }
    callback();
  });
}, 1);

// assign a callback: runs once the last queued URL has been processed
q.drain = function () {
  console.log('all urls have been processed');
  phantom.exit();
};

page.open('http://phantomjs.org/', function (status) {
  console.log(status);
  if (status !== 'success') {
    console.log('Unable to access network');
    // Nothing will be queued, so drain never fires; exit here instead of hanging.
    phantom.exit(1);
    return;
  }

  // Collect the href of every anchor on the start page.
  var links = page.evaluate(function () {
    var nodes = [];
    var matches = document.querySelectorAll('a');
    for (var i = 0; i < matches.length; ++i) {
      nodes.push(matches[i].href);
    }
    return nodes;
  });

  // With an empty queue the drain callback is never invoked, so exit
  // explicitly when there is nothing to crawl.
  if (!links || links.length === 0) {
    console.log('all urls have been processed');
    phantom.exit();
    return;
  }

  links.forEach(function (link) {
    q.push({url: link}, function (err) {
      console.log('finished processing ' + link);
    });
  });
});
Urls are added to the queue and will be processed in parallel (up to the concurrency limit, one here). I reuse the same page instance but that's not mandatory.
As I have already built this kind of crawler in the past, let me give you two more pieces of advice:
[EDIT]
You need to queue this. I have modified your code and added a simple queue mechanism in it.
// Open the listing page, scrape the profile links, then open and render each
// linked page strictly in sequence: the next page.open is only issued from
// inside the previous page.open callback.
var page = require('webpage').create();
page.open('http://example.com/secretpage', function(status) {
  console.log(status);
  if (status !== 'success') {
    console.log('Unable to access network');
    // No links can be processed, so terminate instead of hanging.
    phantom.exit(1);
  } else {
    // Collect the href of every anchor that is a direct child of a .profile
    // element.
    var links = page.evaluate(function() {
      var nodes = [];
      var matches = document.querySelectorAll('.profile > a');
      for (var i = 0; i < matches.length; ++i) {
        nodes.push(matches[i].href);
      }
      return nodes;
    });
    var pointer = 0,
      linksCount = links ? links.length : 0,
      // Processes links[pointer], then re-invokes itself for the next link
      // once that page has finished loading.
      q = function() {
        // recursion exit; also covers the case where no links were found
        if (pointer >= linksCount) {
          phantom.exit();
          return;
        }
        var link = links[pointer];
        console.log(link);
        page.open(link, function(status) {
          console.log(status);
          if (status === 'success') {
            // Guard against pages that have no #username element.
            var name = page.evaluate(function() {
              var el = document.getElementById('username');
              return el ? el.innerHTML : null;
            });
            console.log(name);
            if (name) {
              page.render('profiles/' + name + '.png');
            }
          }
          // advance to the next link, whether or not this one succeeded
          pointer++;
          // recursive call
          q();
        });
      };
    // start queue to load links one by one
    q();
  }
});
NOTE: forEach does not wait for each page to finish loading, because page loading is asynchronous. That is the cause of your issue.
You can read an answer to a similar question about CasperJS (a wrapper around PhantomJS), with code showing how to deal with this, in "How to for loop in casperjs".
You could use Phantom-promise, a PhantomJS bridge with a promise-based API,
or phantom, a PhantomJS integration module for NodeJS.
Other option to open each link in sequence
Basically you've got 3 options, but you may also take a look at CasperJS: navigation scripting & testing for PhantomJS and SlimerJS.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With