I am currently writing a web-application that involves some web-scraping. To help with this, I am employing the help of phantomjs. However, certain (but not all) web pages are returning a status="fail".
Here is the code. (Note: this is actually written in Node.js using the node-phantom library found here: https://github.com/alexscheelmeyer/node-phantom. While the syntax may be different, the library works directly with PhantomJS, so it shouldn't be doing anything different.)
// Start PhantomJS through node-phantom (with SSL errors ignored), open a single
// page, and log how the load went.
phantom.create(function (createErr, ph) {
  ph.createPage(function (pageErr, page) {
    // Report each resource request that failed, with its error code and description.
    page.onResourceError = function (failure) {
      console.log('Unable to load resource (URL:' + failure.url + ')');
      console.log('Error code: ' + failure.errorCode + '. Description: ' + failure.errorString);
    };

    page.onLoadFinished = function (loadStatus) {
      console.log('Status: ' + loadStatus);

      // Any status other than 'success': report the failure and shut PhantomJS down.
      if (loadStatus !== 'success') {
        console.log(
          "Error opening url \"" + page.reason_url
          + "\": " + page.reason
        );
        console.log("Connection failed.");
        ph.exit();
        return;
      }

      page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js', function () {
        // NOTE(review): fetch_results is not declared anywhere in this snippet;
        // presumably it is a flag defined elsewhere in the application.
        if (!fetch_results) {
          // First pass: run the in-page script, take a screenshot, and flag
          // that the next load is the results page (only when evaluate
          // reported no error).
          page.evaluate(function () {
            //page evaluate stuff
          }, function (evalErr, result) {
            console.log("entering here");
            page.render('phantomjs-test.png');
            if (!evalErr) fetch_results = true;
          });
          return;
        }

        // Results-page pass: this is where the results would be processed.
        console.log("results page stuff entered");
        page.render('phantomjs-test2.png');
        ph.exit();
      });
    };

    //page.open("https://www.google.com",function (err,status) {});
    page.open("https://www.pavoterservices.state.pa.us/Pages/PollingPlaceInfo.aspx", function (openErr, openStatus) {});
  });
}, {parameters: {'ignore-ssl-errors': 'yes'}});
So for page.open with google.com, the page loads successfully. However, with the other URL listed, it returns the following error:
Unable to load resource (URL:https://www.pavoterservices.state.pa.us/Pages/PollingPlaceInfo.aspx); Error code: 2. Description: connection closed; Error opening url "undefined": undefined
Any help as to why google will load but not the url listed would be greatly appreciated!
(Note: I answered exactly the same at Issue trying to use PhantomJS to process a web page)
Try calling phantomjs with --ssl-protocol=any
I had the same exact problem, with an external site that worked one week ago.
So I searched, and found a related issue described at Qt QNetworkReply connection closed. It helped me look into the phantomjs' embedded Qt: it defaults to forcing new connections in SSLv3, which is either too new for old sites, or too old for new sites (but was quite a reasonable default at the time Qt 4.8.4 was released).
With "any", you tell phantomjs to try all protocols, which should help you pass the test. It will try more-secure-than-SSLv3 protocols, but less-secure-than-SSLv3 ones too (SSLv3 is in the middle of the range). So, if "any" works, you should then try to force a more-secure-than-SSLv3 value instead of leaving "any". In my case, specifying --ssl-protocol=tlsv1 worked.
I guess that the recent issues with SSL (goto fail, Heartbleed, POODLE, and so on) made a whole lot of websites upgrade their servers, which now refuse SSLv3 connections. But in case your server uses an older-than-SSLv3 protocol, keep the "any" (and all the security risks associated with it…).
This will work.
var phantom = require('phantom');
// Open the Facebook login page in PhantomJS (started with ssl-protocol=any so
// that any SSL/TLS protocol version may be negotiated), save a screenshot, and
// report whether the page loaded.
phantom.create(function(ph) {
  ph.createPage(function(page) {
    page.open('https://www.facebook.com/login.php', function(status) {
      console.log('Opened site? %s', status);
      // The screenshot is taken regardless of the load status.
      page.render("page.png");
      if (status !== 'success') {
        console.log('FAIL to load the address');
      } else {
        console.log('Success in fetching the page');
        another_funny(page, ph);
      }
      // Shut PhantomJS down on both paths: previously exit() was only called
      // on success, so a failed load left the PhantomJS process running and
      // the script never terminated.
      ph.exit();
    });
  });
}, {parameters:{'ssl-protocol':'any'}} );
/**
 * Placeholder follow-up step run after a successful page load.
 * Currently it only logs a fixed message; neither argument is used.
 *
 * @param {*} page - page object handed over by the caller (unused here).
 * @param {*} ph - phantom instance handed over by the caller (unused here).
 * @returns {undefined}
 */
function another_funny(page, ph) {
  var message = "like page";
  console.log(message);
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With