Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Python Webdriver Multithread

I'm trying to spawn multiple webdriver instances with the code from: http://www.ibm.com/developerworks/aix/library/au-threadingpython/

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup
# Sites that the worker threads are asked to load.
hosts = ["http://yahoo.com", "http://google.com", "http://amazon.com",
    "http://ibm.com", "http://apple.com"]
# NOTE(review): the two names below are bound to the Queue.Queue class itself
# (there are no call parentheses), not to queue instances, so a later
# queue.get() has no instance to operate on.
queue = Queue.Queue
out_queue = Queue.Queue

# Worker thread: takes a host URL from `queue`, loads it with a webdriver and
# puts the resulting page source on `out_queue`.
class Login_Driver(threading.Thread):
    # queue: source of host URLs; out_queue: destination for page sources;
    # driver: webdriver instance supplied by the caller (its title is printed).
    def __init__(self, queue, out_queue, driver):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print driver.title
    # Loops forever; there is no exit condition inside the loop.
    def run(self):
        while True:
            #grabs host from queue
            host = self.queue.get()
            #grabs urls of hosts and then grabs chunk of webpage
            # NOTE(review): bare `driver` is neither a parameter nor a local of
            # run(); the instance stored by __init__ is self.driver.
            driver.get(host)
            # NOTE(review): page_source is invoked as a method here -- confirm
            # against the Selenium API, where it appears to be a property.
            chunk = driver.page_source()
            #place chunk into out queue
            self.out_queue.put(chunk)
            #signals to queue job is done
            self.queue.task_done()
# Consumer thread: takes page-source chunks from `out_queue`, parses them with
# BeautifulSoup and prints the <title> elements found.
class Poster(threading.Thread):
    # NOTE(review): threading.Thread.__init__ is not called in this
    # constructor, unlike in Login_Driver.
    # NOTE(review): the parameter order is (driver, out_queue), while main()
    # calls Poster(out_queue) with a single argument.
    def __init__(self, driver, out_queue):
        self.out_queue = out_queue
        self.driver = driver
        print driver.name
    # Loops forever; there is no exit condition inside the loop.
    def run(self):
        while True:
            #grabs a page-source chunk from the out queue
            chunk = self.out_queue.get()
            #parse the chunk
            soup = BeautifulSoup(chunk)
            print soup.findAll(['title'])
            #signals to queue job is done
            self.out_queue.task_done()
# Wall-clock start time, used for the elapsed-time report printed at the end.
start = time.time()
# Starts five Firefox-backed Login_Driver threads, queues every host, starts
# five Poster threads, then blocks until both queues report all work done.
def main():
    #spawn a pool of threads, and pass them queue instance
    for i in range(5):
        driver = webdriver.Firefox()
        t = Login_Driver(queue, out_queue, driver)
        # daemon threads do not keep the process alive once main() returns
        t.setDaemon(True)
        t.start()
        # pauses 20 seconds before starting the next worker
        time.sleep(20)
    #populate queue with data
    for host in hosts:
        queue.put(host)
    for i in range(5):
        # NOTE(review): Poster.__init__ is declared as (driver, out_queue);
        # only one argument is passed here.
        dt = Poster(out_queue)
        dt.setDaemon(True)
        dt.start()
    #wait on the queue until everything has been processed
    queue.join()
    out_queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)

It fails with: TypeError: unbound method get() must be called with Queue instance as first argument (got nothing instead)

I'm new to threads, classes, and processes. Could you please tell me which is more appropriate to use here, threads or processes? If you could give me an example, that would be great. Thank you.

UPDATE

Working code:

import time
import Queue
import urllib2
import threading
from selenium import webdriver
from BeautifulSoup import BeautifulSoup

# Sites to be opened by the worker threads.
hosts = [
    "http://yahoo.com",
    "http://google.com",
    "http://amazon.com",
    "http://ibm.com",
    "http://apple.com",
]

# Shared queues: `queue` carries host URLs to the webdriver workers,
# `out_queue` carries the fetched page sources to the parser threads.
out_queue = Queue.Queue()
queue = Queue.Queue()

class Login_Driver(threading.Thread):
    """Worker thread that loads queued hosts in its own webdriver.

    Each host taken from ``queue`` is opened with ``driver``; the page
    source is then put on ``out_queue`` for the parser threads.
    """

    def __init__(self, queue, out_queue, driver):
        """Remember the shared queues and the webdriver used by this worker.

        queue     -- queue of host URLs to open
        out_queue -- queue that receives the source of every opened page
        driver    -- object providing get(url), page_source and title
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue
        self.driver = driver
        print("In init first class..")

    def run(self):
        """Process hosts forever; meant to be run as a daemon thread."""
        while True:
            #grabs host from queue
            host = self.queue.get()
            try:
                #opens the host and grabs the source of the loaded page
                self.driver.get(host)
                chunk = self.driver.page_source
                #place chunk into out queue
                self.out_queue.put(chunk)
                print(self.driver.title)
            except Exception as exc:
                # One failing page must not kill the worker: the remaining
                # hosts still need a thread to serve them.
                print("Failed to load %s: %s" % (host, exc))
            finally:
                # Always acknowledge the item, otherwise queue.join() in the
                # caller would block forever after a failure.
                self.queue.task_done()
class Poster(threading.Thread):
    """Consumer thread that parses page sources and prints their titles.

    Chunks are taken from ``out_queue``, parsed with BeautifulSoup, and the
    ``<title>`` elements are printed, followed by the name of ``driver``.
    """

    def __init__(self, out_queue, driver):
        """Remember the result queue and the webdriver whose name is printed.

        out_queue -- queue of page sources produced by the webdriver workers
        driver    -- object providing a ``name`` attribute
        """
        threading.Thread.__init__(self)
        self.out_queue = out_queue
        self.driver = driver
        print("In init a second class..")

    def run(self):
        """Process page sources forever; meant to be run as a daemon thread."""
        while True:
            #grabs a page-source chunk from the out queue
            chunk = self.out_queue.get()
            try:
                #parse the chunk
                soup = BeautifulSoup(chunk)
                print(soup.findAll(['title']))
                print(self.driver.name)
            except Exception as exc:
                # A page that cannot be parsed must not kill the thread.
                print("Failed to parse page: %s" % (exc,))
            finally:
                # Always acknowledge the item, otherwise out_queue.join() in
                # the caller would block forever after a failure.
                self.out_queue.task_done()
# Wall-clock start time, used for the elapsed-time report printed at the end.
start = time.time()


def main():
    """Open every host in a pool of Firefox workers and print page titles.

    Starts five Login_Driver threads (one Firefox instance each), queues all
    hosts, starts one Poster thread per driver, and waits until both queues
    have been fully processed. Every browser that was started is quit on the
    way out, including when an error interrupts the run.
    """
    drivers = []
    try:
        #spawn a pool of threads, and pass them queue instance
        for i in range(5):
            driver = webdriver.Firefox()
            # Track the browser immediately so it is quit even if a later
            # step raises.
            drivers.append(driver)
            t = Login_Driver(queue, out_queue, driver)
            t.setDaemon(True)
            t.start()
            print("Started webdriver: --- "+str(i)+" --- from main")
        print("All started")
        # The pauses below keep the original pacing of the script.
        time.sleep(3)
        #populate queue with data
        for host in hosts:
            queue.put(host)
            print("Opening website: "+host)
        print("All sites passed for opening..")
        time.sleep(3)
        # Pair each parser thread with a driver explicitly instead of reusing
        # whatever the loop variable of the first loop last held.
        for i, driver in enumerate(drivers):
            dt = Poster(out_queue, driver)
            dt.setDaemon(True)
            dt.start()
            print("Starting second class/title and name beautifull soup and webdriver: --- "+str(i)+" --- from main")
        print("Started secound class..")
        time.sleep(3)
        #wait on the queue until everything has been processed
        queue.join()
        out_queue.join()
        print("out_queue.join()")
    finally:
        # Close every browser that was started; without this the Firefox
        # processes outlive the script.
        for driver in drivers:
            try:
                driver.quit()
            except Exception as exc:
                print("Failed to quit webdriver: %s" % (exc,))


main()
print("Elapsed Time: %s" % (time.time() - start))
like image 368
user215379 Avatar asked Jun 03 '16 13:06

user215379


2 Answers

You are not instantiating the Queue correctly. Instead of,

queue = Queue.Queue
out_queue = Queue.Queue

it should be

queue = Queue.Queue()
out_queue = Queue.Queue()
like image 64
Vikas Ojha Avatar answered Oct 25 '22 02:10

Vikas Ojha


You need to use Queue.Queue() instead of Queue.Queue

like image 26
Pythonista Avatar answered Oct 25 '22 02:10

Pythonista