Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

PhantomJS page dump script issue

Digikey has changed their website, and it now has JavaScript that is called on load via POST. This broke my former simple Java HTML retriever. I am trying to use PhantomJS so that the JavaScript can execute before I save the HTML/text.

// save.js -- open the URL given on the command line, print the page markup to
// the console, and write the same markup to htmlcode.txt.
//
// Usage: save.js <some URL>
//
// Runs inside PhantomJS (WebPage, phantom and the CommonJS fs module are
// provided by that runtime), so the script sticks to `var` and function
// expressions.
var fs = require('fs');
var page = new WebPage();
var address;

// Write `markup` to htmlcode.txt. Any error from opening, writing or closing
// the file is logged to the console rather than propagated.
function saveMarkup(markup) {
    var f;
    try {
        f = fs.open('htmlcode.txt', "w");
        try {
            f.write(markup);
        } finally {
            // Close the handle even when the write throws.
            f.close();
        }
    } catch (e) {
        console.log(e);
    }
}

if (phantom.args.length === 0) {
    console.log('Usage: save.js <some URL>');
    phantom.exit();
} else {
    address = encodeURI(phantom.args[0]);
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
        } else {
            // The markup is read as soon as the open callback fires.
            var markup = page.content;
            console.log(markup);
            saveMarkup(markup);
        }
        // Exit on both the failure and the success path.
        phantom.exit();
    });
}

This code works with most webpages but fails on:

http://search.digikey.com/scripts/dksearch/dksus.dll?keywords=S7072-ND

Which is my test case. It fails to open the URL and then PhantomJS crashes. Using win32 static build 1.3.

Any tips?

Basically what I am after is a wget that completes the page rendering, including scripts that modify the document, before saving the file.

like image 695
teholabs Avatar asked Jan 01 '12 05:01

teholabs


1 Answers

A quick and dirty solution (one that is also posted on the PhantomJS site) is to use a timeout. I have modified your code to include a 2-second wait. This allows the page to load for 2 seconds before dumping the contents to a file. If you need exact timing, or if the load time varies greatly, this solution probably won't work for you.

// save.js -- open the URL given on the command line, wait a fixed delay after
// the page reports it has loaded, then print the page markup to the console
// and write the same markup to htmlcode.txt.
//
// Usage: save.js <some URL>
//
// Runs inside PhantomJS (WebPage, phantom and the CommonJS fs module are
// provided by that runtime), so the script sticks to `var` and function
// expressions.
var fs = require('fs');
var page = new WebPage();
var address;

// How long to wait after a successful load before dumping the markup (ms).
var RENDER_DELAY_MS = 2000;

// Write `markup` to htmlcode.txt. Any error from opening, writing or closing
// the file is logged to the console rather than propagated.
function saveMarkup(markup) {
    var f;
    try {
        f = fs.open('htmlcode.txt', "w");
        try {
            f.write(markup);
        } finally {
            // Close the handle even when the write throws.
            f.close();
        }
    } catch (e) {
        console.log(e);
    }
}

if (phantom.args.length === 0) {
    console.log('Usage: save.js <some URL>');
    phantom.exit();
} else {
    address = encodeURI(phantom.args[0]);
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            // Nothing to wait for; exit now so the process does not hang.
            phantom.exit();
            return;
        }

        window.setTimeout(function () {
            var markup = page.content;
            console.log(markup);
            saveMarkup(markup);
            // Exit from inside the timer callback: exiting right after
            // scheduling it would end the process before the dump runs.
            phantom.exit();
        }, RENDER_DELAY_MS);
    });
}
like image 137
George Avatar answered Sep 28 '22 09:09

George