Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Can't run Selenium PhantomJS instances in parallel

I'm using Selenium's node.js API to run PhantomJS instances against a series of web pages. The code I use to execute the actions on the pages works fine, but it seems only one instance of Selenium/PhantomJS can run at a time. This function is called multiple times from the same module and steps through pages in a webshop where the pagination is handled client-side (which is why I need the Selenium/PhantomJS environment: to extract data from each page).

Once again, the code in and of itself works fine, but it can't execute in parallel. What could be causing this?

module.exports = function (crawler, page, parsePage, done) {
    "use strict";

    var _ = require("lodash"),
        format = require("util").format,
        path = require("path"),
        webdriver = require("selenium-webdriver"),
        By = webdriver.By,
        until = webdriver.until;

    var phantomPath = path.resolve(__dirname, "../node_modules/.bin/phantomjs"),
        isWin = process.platform === "win32";

    var driver = new webdriver.Builder()
        .withCapabilities({
            "phantomjs.binary.path": isWin ? phantomPath + ".cmd" : phantomPath
        })
        .forBrowser("phantomjs")
        .build();

    var windowHandle = new webdriver.WebDriver.Window(driver);
    windowHandle.setSize(1100, 1000);

    var getAllPagesContent = function (driver) {
        var pagesContent = [],
            pageNo = 1;

        var getNextPage = function () {
            var nextPageLink;

            return driver.findElements(By.css(".pagination li")).then(function (elements) {
                return elements[elements.length - 1];
            }).then(function (element) {
                nextPageLink = element;
                return element.getAttribute("class");
            }).then(function (className) {
                return _.includes(className, "active");
            }).then(function (isLastPage) {
                return (!isLastPage) ? driver.getPageSource() : false;
            }).then(function (content) {
                if (content)
                    pagesContent.push(content);

                content && console.log("Got page %d", pageNo++);

                return nextPageLink.findElement(By.css("a")).then(function (element) {
                    return element.click();
                }).then(function () {
                    return driver.wait(until.stalenessOf(nextPageLink), 10 * 1000);
                }).then(function () {
                    return content ? getNextPage() : pagesContent;
                });
            });
        };

        return getNextPage();
    };


    var processTimeout = setTimeout(function () {
        console.log("PhantomJS for page %s took too long to execute", page.url);
        driver.quit().then(done);
    }, 60 * 1000);

    driver.get(page.url).then(function () {
        var pageOverlay = driver.findElement(By.css("#overlay-the-new"));

        return pageOverlay.isDisplayed().then(function (visible) {
            if (visible) {
                pageOverlay.click();
                return driver.wait(until.elementIsNotVisible(pageOverlay), 10000);
            }
        }).then(function () {
            return getAllPagesContent(driver);
        });
    }).then(function (contents) {
        clearTimeout(processTimeout);
        console.log("Got %d pages for %s", contents.length, page.url);

        _.forEach(contents, function (pageContent) {
            parsePage(page.url, pageContent);
        });

        return driver.quit();
    }).then(function () {
        done();
    });
}
like image 540
fredrikekelund Avatar asked Jul 16 '15 14:07

fredrikekelund


1 Answers

Although PhantomJS is now deprecated, you can still run it in parallel, isolated Docker containers by using Selenoid. There is a ready-to-use image with the latest release here: https://hub.docker.com/r/selenoid/phantomjs/tags/

like image 123
vania-pooh Avatar answered Sep 20 '22 22:09

vania-pooh