Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Making subsequent POST request in session doesn't work - web scraping

Here's what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting next until you're out of pages. Everything up to hitting "next" works. Here's my code. The format of r.content is radically different the two times I print it, indicating that something different is happening between the GET and POST requests even though I want very similar behaviour. Why might this be occurring?

What I find weird is that even after the POST request which seems to be returning the wrong stuff, I can still parse the urls I need - just not the __EVENTVALIDATION input field.

The error message (end of the code) indicates that the content doesn't include this data that I need to make a subsequent request, but navigating to the page shows that it does have that data, and that the format is very similar to the first page.

EDIT: I'm having it open webpages based on the HTML it's parsing, and something's definitely not right. Running the code below will open those pages.

The GET gets me a website with data like this:

<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="4424DBE6">
<input type="hidden" name="__VIEWSTATEENCRYPTED" id="__VIEWSTATEENCRYPTED" value="">
<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="TlIgNH

While the POST produces a site with all of that data at the bottom of the page in plaintext, like this:

|0|hiddenField|__EVENTTARGET||0|hiddenField|__EVENTARGUMENT||0|hiddenField|_

Bad r.content

Good r.content

import requests
from lxml import html
from bs4 import BeautifulSoup



# ASP.NET WebForms form fields for the initial "Search" postback.
# The dynamic hidden-field tokens (__VIEWSTATE, __VIEWSTATEGENERATOR,
# __EVENTVALIDATION) are filled in later, after the GET inside the session.
# NOTE(review): the original code first did a throwaway
# `page = requests.get('http://search.cpsa.ca/physiciansearch')` whose
# response was never used; it duplicated the session GET below and was removed.
d = {"ctl00$ctl13": "ctl00$ctl13|ctl00$MainContent$physicianSearchView$btnSearch",
     "ctl00$MainContent$physicianSearchView$txtLastName": "",
     'ctl00$MainContent$physicianSearchView$txtFirstName': "",
     'ctl00$MainContent$physicianSearchView$txtCity': "",
     "__VIEWSTATEENCRYPTED":"",
     'ctl00$MainContent$physicianSearchView$txtPostalCode': "",
     'ctl00$MainContent$physicianSearchView$rblPractice': "",
     'ctl00$MainContent$physicianSearchView$ddDiscipline': "",
     'ctl00$MainContent$physicianSearchView$rblGender': "",
     'ctl00$MainContent$physicianSearchView$txtPracticeInterests': "",
     'ctl00$MainContent$physicianSearchView$ddApprovals': "",
     'ctl00$MainContent$physicianSearchView$ddLanguage': "",
     "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
     "__EVENTARGUMENT": "",
     'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
     'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
     '__ASYNCPOST': 'true'}

# Headers that mimic the browser's ASP.NET AJAX (UpdatePanel) request.
h ={ "X-MicrosoftAjax":"Delta = true",
"X-Requested-With":"XMLHttpRequest",
     "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
}

urls = []

with requests.session() as s:
    # Initial GET pulls the page containing the ASP.NET hidden fields the
    # first POST must echo back.
    r = s.get("http://search.cpsa.ca/PhysicianSearch", headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    tree = html.fromstring(r.content)
    html.open_in_browser(tree)

    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATEGENERATOR"] = vsg
    d["__VIEWSTATE"] = vs
    # First POST performs the search (btnSearch postback).
    r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)

    print('opening in browser')
    # NOTE(review): this xpath runs against `tree` from the GET page, not the
    # POST results — presumably the result grid only exists after the POST;
    # verify against the live page.
    retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
    print(retrievedUrls)

    urls.extend(retrievedUrls)

    endSearch = False
    while not endSearch:

        tree = html.fromstring(r.content)
        html.open_in_browser(tree)

        soup = BeautifulSoup(r.content, "lxml")
        print('soup2:')
        # The __ASYNCPOST response is a pipe-delimited AJAX delta rather than
        # a full HTML document, so these selectors can come back empty and
        # raise IndexError here.
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]

        # Rewrite the form fields to fire the "Next page" postback.
        d["ctl00$ctl13"] = "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"
        d["__EVENTVALIDATION"] = ev
        d["__EVENTTARGET"] = ""
        d["__VIEWSTATEGENERATOR"] = vsg
        d["__VIEWSTATE"] = vs
        # (the original set this same key twice in a row; once is enough)
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"] = "Next"
        # BUG FIX: the original called requests.post() here, which bypasses the
        # session and drops its cookies; the pager postback must use s.post().
        r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)
        tree = html.fromstring(r.content)
        retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(urls)
        print(retrievedUrls)
        endSearch = True  # single iteration while debugging

...

Traceback (most recent call last):
  File "C:\Users\daniel.bak\workspace\Alberta Physician Scraper\main\main.py", line 63, in <module>
    ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
IndexError: list index out of range
like image 718
Daniel Paczuski Bak Avatar asked Apr 19 '16 16:04

Daniel Paczuski Bak


1 Answer

Well, this nearly drove me mental, but it is finally working: you have to make a GET request to obtain a new __EVENTVALIDATION token for each POST:

import requests

from bs4 import BeautifulSoup

# Browser-mimicking headers for the ASP.NET AJAX (UpdatePanel) postbacks.
h = {"X-MicrosoftAjax": "Delta = true",
     "X-Requested-With": "XMLHttpRequest",
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
     }
# Post data for the initial "Search" postback; __EVENTVALIDATION and
# __VIEWSTATE are added after the GET below.
# (the original had a stray no-op string literal here; removed)
d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
    'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
    'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
    '__ASYNCPOST': 'true'}

# Post data for the pager postback (page 1 -> page 2).
nxt_d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
    "__ASYNCPOST": "true",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

url = "http://search.cpsa.ca/PhysicianSearch"
with requests.session() as s:
    # GET for the fresh __EVENTVALIDATION / __VIEWSTATE tokens.
    r = s.get(url, headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATE"] = vs
    # POST the search itself.
    r = s.post(url, data=d, headers=h)
    # A *new* GET is needed before each POST: every postback consumes the
    # previous __EVENTVALIDATION token.
    soup = BeautifulSoup(s.get("http://search.cpsa.ca/PhysicianSearch").content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    nxt_d["__EVENTVALIDATION"] = ev
    nxt_d["__VIEWSTATE"] = vs
    # POST the pager postback to reach page 2.
    r = s.post(url, data=nxt_d, headers=h)

If you open the source from the last post you will see you hit page 2. We need to add more logic to get through all the pages, I will add it in a bit.

The params:

"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"

are the page to go to and the page you are coming from, so after each GET those two values should be all that needs to change.

This will get all the pages, pulling most of the values programmatically, you could probably pull more especially with the aid of a regex but it pulls most without hard coding values:

from lxml.html import fromstring
import requests


class Crawler(object):
    """Pages through the CPSA physician search by replaying the site's
    ASP.NET AJAX postbacks, yielding an lxml tree per results page."""

    def __init__(self, ua, url):
        # ua: User-Agent string sent with every request.
        # url: search page URL (GET source and POST target).
        self.user_agent = ua
        self.post_header = {"X-MicrosoftAjax": "Delta = true", "X-Requested-With": "XMLHttpRequest", "user-agent": ua}
        # Post data for every pager ("next page") postback.
        self.post_data2 = {'__ASYNCPOST': 'true',
                           "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
        self.url = url
        # Post data for the initial "Search" postback.
        self.post_data1 = { '__ASYNCPOST': 'true'}

    def populate(self, xml):
        """Pulls form post data keys and values for initial post."""
        k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
        k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
        self.post_data1[k1.get("name")] = k1.get("value")
        self.post_data1[k2.get("name")] = k2.get("value")
        self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
        self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

    def populate2(self, xml):
        """Pulls form post data keys and values,
           for all subsequent posts,
           setting initial page number values.
        """
        # Two pager controls share this id: [0] is the target page field,
        # [1] is the source-page/event-target field.
        data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
        self.pge = data[0]
        self.ev = data[1]
        self.post_data2["__EVENTTARGET"] = self.ev
        self.post_data2[self.ev] = "1"
        self.post_data2[self.pge] = "2"

    @staticmethod
    def put_validation(xml, d):
        """Adds a fresh __EVENTVALIDATION token and __VIEWSTATE to *d*.

        Each postback consumes the previous token, so this must be called
        (after a new GET) before every POST.
        """
        ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
        vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs

    def next_page(self, d=None):
        """Increments both pager page numbers by one per iteration.

        *d* is accepted for backward compatibility but ignored; the pager
        state lives in self.post_data2.
        """
        e = self.post_data2[self.ev]
        v = self.post_data2[self.pge]
        self.post_data2[self.pge] = str(int(v) + 1)
        self.post_data2[self.ev] = str(int(e) + 1)

    def start(self):
        """Generator: yields an lxml tree for each page of search results."""
        with requests.session() as s:
            # get initial page to pull __EVENTVALIDATION etc..
            req = s.get(self.url, headers={"user-agent": self.user_agent}).content
            xml = fromstring(req)
            # add __EVENTVALIDATION" to post data.
            self.put_validation(xml, self.post_data1)
            # populate the rest of the post data.
            self.populate(xml)
            resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
            # yield first page results.
            yield resp
            # fill post data for next pages.
            self.populate2(resp)
            # BUG FIX: the original re-checked the *initial* GET tree (`xml`)
            # for a disabled Next button, so the loop condition never changed;
            # check the most recent response instead. Non-empty list => the
            # Next button is disabled => last page reached.
            nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
            while not nxt:
                # update __EVENTVALIDATION token and __VIEWSTATE per post.
                self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                # BUG FIX: the original posted to the module-level `url` and
                # passed the undefined global `nxt_d` to next_page(); use the
                # instance's url and pager state.
                resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                # post to get next page of results.
                yield resp
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                self.next_page()


ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
url = "http://search.cpsa.ca/PhysicianSearch"
c = Crawler(ua, url)
for tree in c.start():
    # BUG FIX: the original loop body was only a comment, which is a
    # SyntaxError in Python. Replace `pass` with real per-page processing
    # of `tree` (an lxml tree of one results page).
    pass
like image 165
Padraic Cunningham Avatar answered Oct 12 '22 12:10

Padraic Cunningham