Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to follow all links in CasperJS?

I'm having trouble clicking all JavaScript based links in a DOM and saving the output. The links have the form

<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>

the following code works great:

var casper = require('casper').create();

var fs = require('fs');

var firstUrl = 'http://www.testurl.com/test.html';

var css_selector = '#jan_html';

casper.start(firstUrl);

casper.thenClick(css_selector, function(){
    console.log("whoop");
});

casper.waitFor(function check() {
    return this.getCurrentUrl() != firstUrl;
}, function then() {
    console.log(this.getCurrentUrl());
    var file_title = this.getTitle().split(' ').join('_') + '.html';
    fs.write(file_title, this.getPageContent());
});

casper.run();

However, how can I get this to work with a selector of "a", clicking all available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector

like image 628
user2846226 Avatar asked Nov 26 '13 18:11

user2846226


2 Answers

I have this script that first will get all links from a page then save 'href' attributes to an array, then will iterate over this array and then open each link one by one and echo the url :

var casper = require('casper').create({
    logLevel:"verbose",
    debug:true
});
var links;

casper.start('http://localhost:8000');

casper.then(function getLinks(){
     links = this.evaluate(function(){
        var links = document.getElementsByTagName('a');
        links = Array.prototype.map.call(links,function(link){
            return link.getAttribute('href');
        });
        return links;
    });
});
casper.then(function(){
    this.each(links,function(self,link){
        self.thenOpen(link,function(a){
            this.echo(this.getCurrentUrl());
        });
    });
});
casper.run(function(){
    this.exit();
});
like image 116
rusln Avatar answered Sep 26 '22 02:09

rusln


rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.

I propose using the XPath generator from stijn de ryck for an element.

  1. You can then sample all XPaths that are on the page.
  2. Then you open the page for every a that you have the XPath for and click it by XPath.
  3. Wait a little if it is a single page application
  4. Do something
var startURL = 'http://localhost:8000',
    xPaths
    x = require('casper').selectXPath;

casper.start(startURL);

casper.then(function getLinks(){
    xPaths = this.evaluate(function(){
        // copied from https://stackoverflow.com/a/5178132/1816580
        function createXPathFromElement(elm) {
            var allNodes = document.getElementsByTagName('*'); 
            for (var segs = []; elm && elm.nodeType == 1; elm = elm.parentNode) { 
                if (elm.hasAttribute('id')) { 
                        var uniqueIdCount = 0; 
                        for (var n=0;n < allNodes.length;n++) { 
                            if (allNodes[n].hasAttribute('id') && allNodes[n].id == elm.id) uniqueIdCount++; 
                            if (uniqueIdCount > 1) break; 
                        }; 
                        if ( uniqueIdCount == 1) { 
                            segs.unshift('id("' + elm.getAttribute('id') + '")'); 
                            return segs.join('/'); 
                        } else { 
                            segs.unshift(elm.localName.toLowerCase() + '[@id="' + elm.getAttribute('id') + '"]'); 
                        } 
                } else if (elm.hasAttribute('class')) { 
                    segs.unshift(elm.localName.toLowerCase() + '[@class="' + elm.getAttribute('class') + '"]'); 
                } else { 
                    for (i = 1, sib = elm.previousSibling; sib; sib = sib.previousSibling) { 
                        if (sib.localName == elm.localName)  i++; }; 
                        segs.unshift(elm.localName.toLowerCase() + '[' + i + ']'); 
                }; 
            }; 
            return segs.length ? '/' + segs.join('/') : null; 
        };
        var links = document.getElementsByTagName('a');
        var xPaths = Array.prototype.map.call(links, createXPathFromElement);
        return xPaths;
    });
});
casper.then(function(){
    this.each(xPaths, function(self, xpath){
        self.thenOpen(startURL);
        self.thenClick(x(xpath));
        // waiting some time may be necessary for single page applications
        self.wait(1000);
        self.then(function(a){
            // do something meaningful here
            this.echo(this.getCurrentUrl());
        });

        // Uncomment the following line in case each click opens a new page instead of staying at the same page
        //self.back()
    });
});
casper.run(function(){
    this.exit();
});
like image 37
Artjom B. Avatar answered Sep 27 '22 02:09

Artjom B.