Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Scraping Javascript driven web pages with PyQt4 - how to access pages that need authentication?

I have to scrape a very, very simple page on our company's intranet in order to automate one of our internal processes (returning a function's output as successful or not).

I found the following example:

import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *

class Render(QWebPage):
    """Web page that loads a URL and blocks until loading has finished.

    Once the ``loadFinished`` signal has fired, ``self.frame`` refers to the
    page's main frame.
    """

    def __init__(self, url):
        """Create the application, load *url* and run the Qt event loop.

        The event loop is left again when ``_loadFinished`` calls ``quit``.
        """
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)

        main_frame = self.mainFrame()
        target = QUrl(url)
        main_frame.load(target)

        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()

# Page to fetch.
url = 'http://sitescraper.net'
# Constructing Render loads the page and runs the Qt event loop until
# Render._loadFinished quits it.
r = Render(url)
# r.frame is assigned in Render._loadFinished; toHtml() is called on it.
html = r.frame.toHtml()

This example comes from http://blog.sitescraper.net/2010/06/scraping-javascript-webpages-in-python.html and it's almost perfect. I just need to be able to provide authentication to view the page.

I've been looking through the documentation for PyQt4 and I'll admit a lot of it is over my head. If anyone could help, I'd appreciate it.

Edit: Unfortunately, gruszczy's method didn't work for me. When I did something similar with urllib2, I used the following code and it worked:

# Credentials sent with the request as an HTTP Basic "Authorization" header.
username = 'user'
password = 'pass'

req = urllib2.Request(url)
# b64encode never inserts line breaks. encodestring wraps its output with
# newlines, which would corrupt the header value for long credentials.
base64string = base64.b64encode('%s:%s' % (username, password))
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)

handle = urllib2.urlopen(req)
like image 807
merph Avatar asked Mar 18 '11 19:03

merph


1 Answer

I figured it out. Here's what I ended up with in case it can help someone else.

#!/usr/bin/python
# -*- coding: latin-1 -*-
import sys
import base64
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork

class Render(QWebPage):
    """Web page that loads a URL with an HTTP Basic "Authorization" header.

    Constructing an instance blocks in the Qt event loop until the
    ``loadFinished`` signal fires; afterwards ``self.frame`` refers to the
    page's main frame.
    """

    def __init__(self, url, username='username', password='password'):
        """Create the application, send the request and run the event loop.

        Args:
            url: Address of the page to load.
            username: User name placed in the Authorization header.
            password: Password placed in the Authorization header.
        """
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)

        # b64encode never inserts line breaks. encodestring wraps its output
        # with newlines, which would corrupt the header for long credentials.
        base64string = base64.b64encode('%s:%s' % (username, password))
        authheader = "Basic %s" % base64string

        headerKey = QByteArray("Authorization")
        headerValue = QByteArray(authheader)

        req = QtNetwork.QNetworkRequest()
        req.setRawHeader(headerKey, headerValue)
        req.setUrl(QUrl(url))

        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)

        self.mainFrame().load(req)
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()

def main():
    """Load http://www.google.com through Render and return the page HTML."""
    url = 'http://www.google.com'
    r = Render(url)
    # Return the markup so the result of the scrape is not thrown away.
    return r.frame.toHtml()
like image 52
merph Avatar answered Oct 23 '22 16:10

merph