How can I "render" HTML with with PyQt5 v5.6 QWebEngineView?
I have previously performed the task with PyQt5 v5.4.1 QWebPage, but it was suggested to try the newer QWebEngineView.
Here's that implementation (it generally works as expected, but has a tendency to hang indefinitely for some sites and situations):
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebKitWidgets import QWebPage
class Render(QWebPage):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
self.html = self.mainFrame().toHtml()
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
What follows is my attempt at using QWebEngineView. First, the installation and setup of PyQt5 v5.6 on Ubuntu:
# install PyQt5 v5.6 wheel from PyPI
pip3 install --user pyqt5
# link missing resources
ln -s ../resources/icudtl.dat ../resources/qtwebengine_resources.pak ../resources/qtwebengine_resources_100p.pak ../resources/qtwebengine_resources_200p.pak ../translations/qtwebengine_locales ~/.local/lib/python3.5/site-packages/PyQt5/Qt/libexec/
Now for the Python... The following results in a segmentation fault:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# what's going on here? how can I get the HTML from toHtml?
self.page().toHtml(self.callable)
self.app.quit()
def callable(self, data):
self.html = data
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The trouble appears to lie in the call to asynchronous toHtml()
. It seems like it should be fairly simple, but I'm at a loss with what to do with it. I see it's been discussed in the context of C++, but I'm not sure how to translate this to Python. How can I get the HTML out?
Quite a bit of discussion on the topic was made in the following thread: https://riverbankcomputing.com/pipermail/pyqt/2015-January/035324.html
The new QWebEngine interface takes account of the fact that the underlying Chromium engine is asynchronous. As such we have to turn an asynchronous API into a synchronous one.
Here's how that looks:
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
The answer by Six & Veehmot is great, but I found out that for my purpose it was not sufficient, as it did not expand the dropdown elements of the page that I wanted to scrape. A slight modification fixed this:
def render(url):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtCore import QEventLoop,QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, url):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
self.html = data
def _loadFinished(self, result):
self.page().toHtml(self._callable)
return Render(url).html
print(render(dummy_url))
As you pointed out, Qt5.4 relies on async calls. It's not necessary to use the Loop (as seen on your answer), since your only mistake was to call quit
before the toHtml
call finishes.
def render(source_html):
"""Fully render HTML, JavaScript and all."""
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
class Render(QWebEngineView):
def __init__(self, html):
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.setHtml(html)
self.app.exec_()
def _loadFinished(self, result):
# This is an async call, you need to wait for this
# to be called before closing the app
self.page().toHtml(self.callable)
def callable(self, data):
self.html = data
# Data has been stored, it's safe to quit the app
self.app.quit()
return Render(source_html).html
import requests
sample_html = requests.get(dummy_url).text
print(render(sample_html))
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With