Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Promise framework for PhantomJS?

Tags:

phantomjs

I'm new to PhantomJS. I want to load a page, scrape its links, then open each of those in sequence, one at a time, perhaps even with a delay between each request. I'm having trouble getting one to fire after the other, so I thought maybe I could use promises to solve this problem, but I don't think Node libraries work with Phantom. Every example I've seen so far opens a single page, and then quits.

Here's what I've got:

// PhantomJS script: open a page, scrape profile links, then try to open each
// link and screenshot it. NOTE: this is the broken version being asked about —
// all page.open() calls below fire at once on the SAME page object.
var page = require('webpage').create();

page.open('http://example.com/secretpage', function(status) {
    console.log(status);
    if(status !== 'success') {
        console.log('Unable to access network');
    } else {
        // Runs in the page context; returns the href of every '.profile > a'.
        var links = page.evaluate(function() {
            var nodes = [];
            var matches = document.querySelectorAll('.profile > a');
            for(var i = 0; i < matches.length; ++i) {
                nodes.push(matches[i].href);
            }
            return nodes;
        });


        // BUG: forEach is synchronous but page.open is async, so every open
        // is started immediately and they all share the one `page` instance —
        // the loads trample each other instead of running in sequence.
        links.forEach(function(link) {
            console.log(link);
            page.open(link, function(status) { // <---- tries opening every page at once
                console.log(status);

                var name = page.evaluate(function() {
                    return document.getElementById('username').innerHTML;
                });

                console.log(name);
                page.render('profiles/'+name + '.png');
            });
        });
    }
//    phantom.exit();
});

Is there a way I can open each link in sequence?

like image 467
mpen Avatar asked Feb 25 '14 04:02

mpen


3 Answers

For this typical scenario, I use async.js and especially the queue component.

Here is a very basic implementation

// Load async.js into the PhantomJS (outer) context so async.queue is available.
phantom.injectJs('async.js');

// Single page instance, reused for every URL in the queue.
var page = require('webpage').create();

// Worker queue with concurrency 1: each URL is opened only after the
// previous one has finished, so the loads run strictly in sequence.
var q = async.queue(function (task, callback) {
    page.open(task.url, function(status) {
        if(status !== 'success') {
            console.log('Unable to open url > '+task.url);
        } else {
            console.log('opened '+task.url);
            //do whatever you want here ...
            page.render(Date.now() + '.png');
        }
        // Signal task completion so the queue starts the next URL.
        callback();
    });
}, 1);

// Fired once every pushed URL has been processed.
q.drain = function() {
    console.log('all urls have been processed');
    phantom.exit();
};

page.open('http://phantomjs.org/', function(status) {
    console.log(status);
    if(status !== 'success') {
        console.log('Unable to access network');
    } else {
        // Runs in the page context; collects the href of every anchor.
        var links = page.evaluate(function() {
            var nodes = [];
            var matches = document.querySelectorAll('a');
            for(var i = 0; i < matches.length; ++i) {
                nodes.push(matches[i].href);
            }
            return nodes;
        });

        // Enqueue each link; the queue serializes the page.open calls.
        links.forEach(function(link) {
            q.push({url: link}, function (err) {
                console.log('finished processing '+link);
            });
        });
    }
});

URLs are added to the queue and processed in parallel up to the concurrency limit — set to one here, so they effectively load in sequence. I reuse the same page instance, but that's not mandatory.

As I have already built this kind of crawler in the past, let me give you two more pieces of advice:

  • Do not load images to speed up the test
  • href is sometimes relative, so check first whether it's a valid absolute URL
like image 194
Cybermaxs Avatar answered Oct 18 '22 06:10

Cybermaxs


[EDIT]

You need to queue this. I have modified your code and added a simple queue mechanism in it.

var page = require('webpage').create();

page.open('http://example.com/secretpage', function(status) {
    console.log(status);
    if (status !== 'success') {
        console.log('Unable to access network');
    } else {
        // Runs in the page context; returns the href of every '.profile > a'.
        var links = page.evaluate(function() {
            var nodes = [];
            var matches = document.querySelectorAll('.profile > a');
            for (var i = 0; i < matches.length; ++i) {
                nodes.push(matches[i].href);
            }
            return nodes;
        });

        var pointer = 0,
            linksCount = links.length,
            // q opens one link, and only schedules the next link from inside
            // the load callback — this is what serializes the page.open calls.
            q = function() {
                var link = links[pointer];
                console.log(link);

                page.open(link, function(status) {
                    console.log(status);

                    var name = page.evaluate(function() {
                        return document.getElementById('username').innerHTML;
                    });

                    console.log(name);
                    page.render('profiles/' + name + '.png');

                    // pointer increments;
                    pointer++;
                    if (pointer === linksCount) {
                        // recursion exit: all links processed
                        phantom.exit();
                    }
                    else {
                        // recursive call: open the next link only now
                        q();
                    }
                });
            };

        // start queue to load links one by one;
        // exit immediately if no links were found (q() would open undefined)
        if (linksCount === 0) {
            phantom.exit();
        } else {
            q();
        }
    }
});

NOTE: forEach does not wait for each page to load, and page loading is asynchronous — hence your issue.

You can read an answer to similar question on CasperJS (a wrapper around PhantomJS) with code how to deal with this from How to for loop in casperjs

like image 4
sudipto Avatar answered Oct 18 '22 05:10

sudipto


You could use phantom-promise (a PhantomJS bridge with a promise-based API) or phantom (a PhantomJS integration module for NodeJS). Other options to open each link in sequence:

  1. Cybermaxs answer
  2. Use example from waitFor as suggested Cybermaxs on other SO question

Basically you have three options, but you may also want to take a look at CasperJS, a navigation scripting & testing utility for PhantomJS and SlimerJS.

like image 2
Adi Prasetyo Avatar answered Oct 18 '22 07:10

Adi Prasetyo