I am trying to scrape a webpage that has a form with many dropdowns whose values are interdependent. At many points I need the code to wait until the page has finished refreshing. E.g., after selecting an option from a list, the code should wait until the next list has been populated based on that selection. It would be really helpful if someone could give me some pointers because, strangely, my code only works after I added a lot of unnecessary logging statements, which in turn introduced some delay. Any suggestions to improve the code would be very helpful.
// Create the Casper instance used by every step of this script.
var casper = require('casper').create({
    // No PhantomJS page settings are overridden.
    pageSettings: {},
    // User-agent string sent with the requests.
    userAgent: 'Mozilla/5.0 poi poi poi (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22',
    // Print log messages as they happen, down to debug level.
    logLevel: 'debug',
    verbose: true
});
// Step 1: open the site and sign in through the quick-login form.
casper.start('http://www.abc.com', function () {
    var loginFormSelector = 'form[action="http://www.abc.com/forum/member.php"]';
    var credentials = {
        quick_username: "qwe",
        quick_password: "qwe"
    };

    console.log("casper started");
    // The third argument (`true`) submits the form once the fields are filled.
    this.fill(loginFormSelector, credentials, true);
    // NOTE(review): this screenshot is taken in the same step as the submit, so it
    // presumably shows the page before the login response arrives - confirm intent.
    this.capture('screen.png');
});
// Step 2: open the search page, tick the first filter and choose a status/state.
casper.thenOpen("http://www.abc.com/search/index.php").then(function () {
    this.click('input[type="checkbox"][name="firstparam"]');
    this.click('a#poi');
    this.evaluate(function () {
        document.getElementsByName("status")[0].value = 1;
        document.getElementsByName("state")[0].value = 1078;
        // changeState() is the page's own handler associated with the "state"
        // dropdown; calling it makes the page reload.
        changeState();
        return true;
    });
    // Read the form action in a nested step rather than inline: Casper does not
    // start a queued step while a page load is in progress, so this runs only
    // after the reload triggered above has completed.
    // NOTE(review): if changeState() repopulates the next dropdown via AJAX instead
    // of a navigation, wait for that dropdown with this.waitForSelector(...) instead.
    this.then(function () {
        this.echo('Inside the first thenOpen' + this.evaluate(function () {
            return document.search.action;
        }));
    });
});
// Step 3: tick the college checkbox and press the "add college" button.
casper.then(function () {
    this.capture("poi.png");
    console.log('just before injecting jquery');
    // jQuery is injected into the page that is loaded right now; the evaluate()
    // call below relies on `$` being available there.
    casper.page.injectJs('./jquery.js');
    this.click('input[type="checkbox"][name="or"]');
    this.evaluate(function () {
        $('.boxline .filelist input:checkbox[value=18127]').attr("checked", true);
    });
    this.echo('Just before pressing the add college button' + this.evaluate(function () {
        return document.search.action;
    }));
    this.capture('collegeticked.png');
    // NOTE(review): the existence check looks for name="niv" while the click below
    // targets name="poi" - confirm which button name is the real one.
    if (this.exists('input[type="button"][name="niv"]')) {
        this.echo('button is there');
    } else {
        this.echo('button is not there');
    }
    this.echo("Going to print return value");
    // This click causes a page refresh.
    this.click('input[type="button"][name="poi"]');
    // Inspect the page in a nested step: Casper holds queued steps back while a
    // page load is in progress, so these reads see the refreshed page instead of
    // racing the reload.
    this.then(function () {
        this.echo('Immediately after pressing the add college btn getPresentState()' + this.evaluate(function () {
            return getPresentState();
        }));
        this.echo('Immediately after pressing add colleg button' + this.evaluate(function () {
            return document.search.action;
        }));
        this.capture('iu.png');
    });
});
// Step 4: submit the search form.
casper.then(function () {
    console.log('just before form submit');
    // Submitting the form causes another page refresh.
    this.click('form[name="search"] input[type="submit"]');
    // Log and capture in a nested step so they run after the navigation started by
    // the click has finished (Casper does not start a queued step mid-load).
    // NOTE(review): getPresentState() and document.search belong to the search
    // page; confirm they still exist on the page shown after the submit.
    this.then(function () {
        this.echo('Immediately after search btn getPresentState()' + this.evaluate(function () {
            return getPresentState();
        }));
        this.echo('Immediately after search button-action' + this.evaluate(function () {
            return document.search.action;
        }));
        this.capture("mnf.png");
    });
});
// Step 5: inspect the results page.
casper.then(function () {
    // Inject jQuery into the results page so `$` is available to the evaluate() below.
    casper.page.injectJs('./jquery.js');
    this.capture("resultspage.png");
    this.echo('Page title is: ' + this.evaluate(function () {
        return document.title;
    }), 'INFO');
    // Count the matching cells inside the page and return only the number:
    // evaluate() can hand back JSON-serialisable values only, so a jQuery
    // collection (which wraps DOM nodes) cannot be returned and measured out here.
    var matchCount = this.evaluate(function () {
        return $('tbody tr td.tdbottom:contains("tye") ').siblings().filter($('td>a').parent()).length;
    });
    console.log("ARBABU before" + matchCount);
});

// Execute all the queued steps.
casper.run();
I've been using the waitForSelector 'workaround' mentioned by Arun here: https://stackoverflow.com/a/22217657/1842033
It's the best solution I've found; the 'drawback', as it were, is that you need to know which element you're expecting to load. I say drawback, but personally I don't think I've encountered a situation where I haven't had some kind of feedback indicating that whatever I'm waiting for has happened.
// Wait for {myElement} to appear; report a test pass when it does, or a test
// failure if it has not shown up within the timeout.
this.waitForSelector(
    "{myElement}",
    function onElementFound() {
        test.pass("Found {myElement}");
    },
    function onWaitTimedOut() {
        test.fail("Did not load element {myElement}");
    },
    // Timeout limit: 20000 ms (20 seconds).
    20000
);
Although I'd guess you could use waitForResource() or something like that if you didn't have visual feedback.
What I've taken to doing to get around this issue, when there isn't anything specific to target and wait for in the reloaded page, is to use the following:
// Marker class, made unique with the current timestamp, that is attached to the
// current page's <body>; it disappears once the page has been replaced.
var classname = 'reload-' + (new Date().getTime()),
    callback = function(){},   // placeholder: runs once the marker class is gone
    timeout = function(){};    // placeholder: runs if the wait times out
/// It happens when they change something...
// Tag the body inside a queued step so it is applied in order with the click and
// the wait below; a bare casper.evaluate() would run immediately, out of step order.
casper.then(function () {
    this.evaluate(function(classname){
        document.body.className += ' ' + classname;
    }, classname);
});
casper.thenClick('#submit'); /// <-- will trigger a reload of the page
// Continue once the tagged body no longer matches, i.e. the old page is gone.
casper.waitWhileSelector('body.' + classname, callback, timeout);
This way I don't have to rely on a specific expected element in the next page, I've basically done the inverse. I've created a specific selector to watch out for, and execution moves on once that selector fails to match.
For my intents and purposes it was enough to know the page had begun reloading; I didn't need to wait until the next page had fully loaded. This is so that I could then trigger certain waitForSelector calls on elements that may have existed both before and after the reload. Waiting until the temporary class has been removed lets me know that anything that existed before has since been destroyed, so there is no fear of selecting elements from before the reload.
There seem to be no real solutions. http://docs.casperjs.org/en/latest/modules/casper.html#waitforselector is an available workaround, though it may not always work.
I have had the same experience doing the same thing as you. Scripting this way, from the user's perspective, has never gone well: it crashes in the middle of nowhere and is very unreliable. I was running searches on Salesforce, which also requires a login.
You need to keep your steps as minimal as possible. Script it the way you would a cron job. Don't do form fills/button clicks unless you are doing UI testing. I would advise you to break the process into two parts:
// The first part does the search and finds the exact URL of the page you want to capture.
// Save it in a DB/CSV file.
1 - Start by sending a POST to http://www.abc.com/forum/member.php with the username and password in the body.
2 - Send a POST/GET to http://www.abc.com/search/index.php with your search criteria; check what the website requires. If it uses POST, then POST.
// The second part reads your input.
1 - Log in the same way as in the first part.
2 - Use Casper to loop (forEach) over your input and save each capture. (Save the capture result in a DB/CSV file.)
My script is now pure PhantomJS; the Casper script just kept crashing for no reason. Even PhantomJS is unreliable. I save the result/status after each successful search/download, and whenever there is an error I exit the script; otherwise the rest of the results are unpredictable (a good result in Chrome can turn out bad in PhantomJS).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With