I'm using Selenium's node.js API to run PhantomJS instances against a series of web pages. The code I use to execute the actions on the pages works fine, but it seems only one instance of Selenium/PhantomJS can run at a time. This function is called multiple times from the same module and steps through pages in a webshop where the pagination is handled client-side (which is why I need the Selenium/PhantomJS environment: to extract data from each page).
Once again, the code in and of itself works fine, but it can't execute in parallel. What could be causing this?
module.exports = function (crawler, page, parsePage, done) {
"use strict";
var _ = require("lodash"),
format = require("util").format,
path = require("path"),
webdriver = require("selenium-webdriver"),
By = webdriver.By,
until = webdriver.until;
var phantomPath = path.resolve(__dirname, "../node_modules/.bin/phantomjs"),
isWin = process.platform === "win32";
var driver = new webdriver.Builder()
.withCapabilities({
"phantomjs.binary.path": isWin ? phantomPath + ".cmd" : phantomPath
})
.forBrowser("phantomjs")
.build();
var windowHandle = new webdriver.WebDriver.Window(driver);
windowHandle.setSize(1100, 1000);
var getAllPagesContent = function (driver) {
var pagesContent = [],
pageNo = 1;
var getNextPage = function () {
var nextPageLink;
return driver.findElements(By.css(".pagination li")).then(function (elements) {
return elements[elements.length - 1];
}).then(function (element) {
nextPageLink = element;
return element.getAttribute("class");
}).then(function (className) {
return _.includes(className, "active");
}).then(function (isLastPage) {
return (!isLastPage) ? driver.getPageSource() : false;
}).then(function (content) {
if (content)
pagesContent.push(content);
content && console.log("Got page %d", pageNo++);
return nextPageLink.findElement(By.css("a")).then(function (element) {
return element.click();
}).then(function () {
return driver.wait(until.stalenessOf(nextPageLink), 10 * 1000);
}).then(function () {
return content ? getNextPage() : pagesContent;
});
});
};
return getNextPage();
};
var processTimeout = setTimeout(function () {
console.log("PhantomJS for page %s took too long to execute", page.url);
driver.quit().then(done);
}, 60 * 1000);
driver.get(page.url).then(function () {
var pageOverlay = driver.findElement(By.css("#overlay-the-new"));
return pageOverlay.isDisplayed().then(function (visible) {
if (visible) {
pageOverlay.click();
return driver.wait(until.elementIsNotVisible(pageOverlay), 10000);
}
}).then(function () {
return getAllPagesContent(driver);
});
}).then(function (contents) {
clearTimeout(processTimeout);
console.log("Got %d pages for %s", contents.length, page.url);
_.forEach(contents, function (pageContent) {
parsePage(page.url, pageContent);
});
return driver.quit();
}).then(function () {
done();
});
}
Although PhantomJS is now deprecated, you can still run it in parallel in isolated Docker containers by using Selenoid. There is a ready-to-use image with the latest release here: https://hub.docker.com/r/selenoid/phantomjs/tags/
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With