Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

CasperJS loop or iterate through multiple web pages?

I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop through the different subpages given this code:

// Script-level result lists; the navigation steps below assign to them.
var dates = [];
var ratings = [];

// File-system module, used at the end of the run to write the output file.
var fs = require('fs');

// Casper instance configured with image and plugin loading switched off,
// debug-level logging, and verbose output enabled.
var casper = require('casper').create({
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadPlugins: false,
        loadImages: false
    }
});

/**
 * Collects the `title` attribute of every rating image matched by the
 * selector below. It reads the global `document`, so it is meant to be
 * passed to evaluate() and executed inside the page.
 *
 * @returns {Array} one `title` value per matched image.
 */
function getRatings() {
    var selector = '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img';
    var images = document.querySelectorAll(selector);
    var titles = [];
    for (var idx = 0; idx < images.length; idx++) {
        titles.push(images[idx].getAttribute('title'));
    }
    return titles;
}

/**
 * Collects the inner HTML of every review-date element matched by the
 * selector below. It reads the global `document`, so it is meant to be
 * passed to evaluate() and executed inside the page.
 *
 * @returns {Array} one innerHTML string per matched element.
 */
function getDate() {
    var selector = '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate';
    var nodes = document.querySelectorAll(selector);
    var found = [];
    for (var idx = 0; idx < nodes.length; idx++) {
        found.push(nodes[idx].innerHTML);
    }
    return found;
}

// Step 1: open the reviews page; its callback just echoes 'hi'.
// Step 2: run both scrapers in the page, keep the results in the
// script-level `ratings` / `dates` variables, and print the ratings.
casper
    .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function() {
        this.echo('hi');
    })
    .then(function() {
        ratings = this.evaluate(getRatings);
        dates = this.evaluate(getDate);

        this.echo(ratings);
    });


// Run the queued steps; the callback then merges the scraped data,
// writes it to disk, and exits.
casper.run(function() {
    var outputPath = "C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt";

    this.echo(ratings.length + ' ratings found:');

    // Turn each rating into "<rating>: <date>" and blank the matching date slot.
    ratings.forEach(function(rating, idx) {
        ratings[idx] = rating + ': ' + dates[idx];
        dates[idx] = '';
    });
    this.echo(ratings);

    // One merged entry per line; mode 'w' is passed to fs.write as before.
    fs.write(outputPath, ratings.join("\n"), 'w');

    this.echo(dates.length + ' dates found:').exit();
});

Any help is appreciated :)

like image 563
karansolo Avatar asked Apr 30 '14 09:04

karansolo


3 Answers

Since there exists a next page button, you can use it to traverse all pages recursively:

/**
 * Casper step: scrapes the ratings and dates of the page currently loaded,
 * appends them to the output file as "<rating>: <date>" lines, then clicks
 * the "next page" link (while it is visible) and queues itself again.
 *
 * Reads the outer `casper`, `fs`, `getRatings` and `getDate`, and assigns the
 * outer `ratings` and `dates` variables.
 */
function getRatingsAndWrite(){
    // Fall back to empty lists so the step does not crash on `.length`
    // if evaluate() yields nothing for this page.
    ratings = casper.evaluate(getRatings) || [];
    dates = casper.evaluate(getDate) || [];

    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');

    for(var i=0; i<ratings.length; i++){
        ratings[i] = ratings[i]+': '+dates[i];
    }
    casper.echo(ratings);

    // Terminate every line, including the last one. The file is opened in
    // append mode once per page, so without the trailing newline the last
    // entry of one page and the first entry of the next would share a line.
    var content = ratings.length ? ratings.join("\n") + "\n" : "";

    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a'); 

    casper.echo(dates.length + ' dates found:');

    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        // NOTE(review): if the next page is rendered asynchronously, the step
        // queued below may run before the new reviews appear; consider a
        // wait()/waitFor() between the click and the scrape.
        casper.thenClick(nextLink);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

// Open the first reviews page, queue the scrape-and-paginate step, then run.
casper
    .start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm')
    .then(getRatingsAndWrite)
    .run();

A related answer is A: CasperJS parse next page after button click.

like image 90
Artjom B. Avatar answered Oct 18 '22 18:10

Artjom B.


This code may help: define the URLs you want, together with the selectors to use on each page, in an array of objects; then loop over that array and do whatever you need with those properties.

Inside the loop you can also click a link instead of opening a URL.

// Pages to visit: each entry holds the URL to open plus the selectors used
// to read the ratings and the dates on that page.
var navigation = [
    {
        url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    },
    {
        url: 'yourSecondUrl, etc...',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    }
];

// Accumulator for the text scraped from every page.
var content = "";

    // Queue an open-and-scrape sequence for every entry of `navigation`,
    // appending what is read to `content`, then run the whole suite.
    casper.start()
    .then(function(){
        //loop on the array
        navigation.forEach(function(navIndex){
            //open url : property url 
            casper.thenOpen(navIndex.url)
            //wait for the page to load -> probably redundant, since thenOpen() already opens the page
            .waitForUrl(navIndex.url, function(){
                //get the value of attribute title of adequate selector
                // NOTE(review): getElementAttribute() appears to read a single
                // element only — confirm against the CasperJS API if every
                // rating on the page is needed.
                var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                //get the HTML of adequate selector (the entries name this property `selectorDate`)
                var dates = this.getHTML(navIndex.selectorDate);
                this.echo(ratings);
                this.echo(dates);
                content = content +  ' ' + ratings + ' ' + dates;
            }); 
        });
    })
    .run(function() {
            this.echo('----------- All steps done ------------\n');
            this.exit();
    });
like image 21
Fanch Avatar answered Oct 18 '22 19:10

Fanch


Thanks Fanch and Artjom B. Both of your answers led to the working solution. I used the recursive walk through the 'next' pages of the pagination, as given by Artjom B. Then I added a wait() call to make sure the next ratings page has loaded before it is scraped. Without this wait(), the same page gets scraped multiple times in the interval between clicking 'next' and the next page finishing loading. See the working code below:

// File-system module, used by the scraping step to append results to a file.
var fs = require('fs');

// Casper instance configured with image and plugin loading switched off,
// debug-level logging, and verbose output enabled.
var casper = require('casper').create({
    logLevel: "debug",
    verbose: true,
    pageSettings: {
        loadImages: false,
        loadPlugins: false
    }
});

// Script-level result lists, reassigned on every scraped page.
var ratings = [];
var dates = [];

/**
 * Gathers the `title` attribute of each rating image matched by the selector
 * below. It relies on the global `document`, so it is intended to be handed
 * to evaluate() and run inside the page.
 *
 * @returns {Array} the `title` values, one per matched image.
 */
function getRatings() {
    var titles = [];
    var images = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
    Array.prototype.forEach.call(images, function(image) {
        titles.push(image.getAttribute('title'));
    });
    return titles;
}

/**
 * Gathers the inner HTML of each review-date element matched by the selector
 * below. It relies on the global `document`, so it is intended to be handed
 * to evaluate() and run inside the page.
 *
 * @returns {Array} the innerHTML strings, one per matched element.
 */
function getDate() {
    var found = [];
    var nodes = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
    Array.prototype.forEach.call(nodes, function(node) {
        found.push(node.innerHTML);
    });
    return found;
}

/**
 * Casper step: scrapes the ratings and dates of the page currently loaded,
 * appends them to the output file as "<rating>: <date>" lines, then clicks
 * the "next page" link (while it is visible), waits, and queues itself again.
 *
 * Reads the outer `casper`, `fs`, `getRatings` and `getDate`, and assigns the
 * outer `ratings` and `dates` variables.
 */
function getRatingsAndWrite(){
    // Fall back to empty lists so the step does not crash on `.length`
    // if evaluate() yields nothing for this page.
    ratings = casper.evaluate(getRatings) || [];
    dates = casper.evaluate(getDate) || [];

    casper.echo(ratings.length + ' ratings found:');

    for(var i=0; i<ratings.length; i++){
        // Keep only the first character of the rating title
        // (presumably the numeric score — confirm against the page markup).
        var rating = ratings[i].charAt(0);
        ratings[i] = rating +': '+dates[i];
    }

    // Terminate every line, including the last one. The file is opened in
    // append mode once per page, so without the trailing newline the last
    // entry of one page and the first entry of the next would share a line.
    var content = ratings.length ? ratings.join("\n") + "\n" : "";

    fs.write("<filepath to write content>", content, 'a'); 

    casper.echo(dates.length + ' dates found:');

    var nextLink = ".BVRRPageLink.BVRRNextPage > a";
    if (casper.visible(nextLink)) {
        casper.thenClick(nextLink);
        // Give the next page of reviews time to load before scraping again.
        casper.wait(3000);
        casper.then(getRatingsAndWrite);
    } else {
        casper.echo("END");
    }
}

// Open the product page, queue the scrape-and-paginate step, then run.
casper
    .start('http://www.t-mobile.com/cell-phones/htc-one-m8.html')
    .then(getRatingsAndWrite)
    .run();
like image 2
karansolo Avatar answered Oct 18 '22 18:10

karansolo