Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

PyQt Class not working for the second usage

I'm using PyQt to fully load a page(including JS) and get it contents using Beautiful Soup. Works fine at the first iteration, but after, it crashes. I don't have a big knowledge in Python, and even less in PyQt, so any help is very welcome.

Class borrowed from here.

from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal


class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.html = None
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _finished_loading(self, result):
        self.html = self.mainFrame().toHtml()
        self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
        self.app.quit() 

###################################################################


l = ["http://www.google.com/?q=a", "http://www.google.com/?q=b", "http://www.google.com/?q=c"]

for page in l:
    soup = Render(page).soup
    print("# soup done: " + page)

enter image description here

like image 510
Marcelo Assis Avatar asked Feb 20 '14 13:02

Marcelo Assis


1 Answers

The example crashes because the RenderPage class attempts to create a new QApplication and event-loop for every url it tries to load.

Instead, only one QApplication should be created, and the QWebPage subclass should load a new url after each page has been processed, rather than using a for-loop.

Here's a re-write of the example which should do what you want:

import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit

class WebPage(QtWebKit.QWebPage):
    def __init__(self):
        QtWebKit.QWebPage.__init__(self)
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        try:
            self._url, self._func = next(self._items)
            self.mainFrame().load(QtCore.QUrl(self._url))
        except StopIteration:
            return False
        return True

    def handleLoadFinished(self):
        self._func(self._url, self.mainFrame().toHtml())
        if not self.fetchNext():
            print('# processing complete')
            QtGui.qApp.quit()


def funcA(url, html):
    print('# processing:', url)
    # soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    # do stuff with soup...

def funcB(url, html):
    print('# processing:', url)
    # soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    # do stuff with soup...

if __name__ == '__main__':

    items = [
        ('http://stackoverflow.com', funcA),
        ('http://google.com', funcB),
        ]

    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print('Press Ctrl+C to quit\n')
    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.process(items)
    sys.exit(app.exec_())
like image 74
ekhumoro Avatar answered Sep 26 '22 03:09

ekhumoro