Here's what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting "next" until you're out of pages. Everything up to hitting "next" works. Here's my code. The format of r.content is radically different between the two times I print it, indicating that something different is happening between the GET and POST requests even though I want very similar behaviour. Why might this be occurring?
What I find weird is that even after the POST request, which seems to be returning the wrong stuff, I can still parse the URLs I need, just not the __EVENTVALIDATION input field.
The error message (at the end of the code) indicates that the content doesn't include the data I need to make the subsequent request, but navigating to the page shows that it does have that data, and that the format is very similar to the first page.
EDIT: I'm having it open webpages based on the HTML it's parsing, and something's definitely not right. Running the code below will open those pages.
The GET gets me a website with data like this:
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="4424DBE6">
<input type="hidden" name="__VIEWSTATEENCRYPTED" id="__VIEWSTATEENCRYPTED" value="">
<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="TlIgNH
While the POST produces a page with all of that data in plaintext at the bottom of the response, like this:
|0|hiddenField|__EVENTTARGET||0|hiddenField|__EVENTARGUMENT||0|hiddenField|_
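For reference, the hidden fields can still be pulled out of that pipe-delimited format with a plain string split. A rough sketch (hidden_fields_from_delta is just an illustrative name, and it assumes field values never contain a literal "|"):

def hidden_fields_from_delta(text):
    """Pull hiddenField entries out of an ASP.NET AJAX delta response.
    Sketch only: the delta is pipe-delimited, with hidden fields
    appearing as ...|hiddenField|<name>|<value>|..., so this breaks
    if a value ever contains a literal '|'.
    """
    parts = text.split('|')
    fields = {}
    for i, token in enumerate(parts):
        if token == 'hiddenField' and i + 2 < len(parts):
            fields[parts[i + 1]] = parts[i + 2]
    return fields

# e.g. hidden_fields_from_delta(r.text).get('__EVENTVALIDATION')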
import requests
from lxml import html
from bs4 import BeautifulSoup

page = requests.get('http://search.cpsa.ca/physiciansearch')
print('got page!')

d = {"ctl00$ctl13": "ctl00$ctl13|ctl00$MainContent$physicianSearchView$btnSearch",
     "ctl00$MainContent$physicianSearchView$txtLastName": "",
     'ctl00$MainContent$physicianSearchView$txtFirstName': "",
     'ctl00$MainContent$physicianSearchView$txtCity': "",
     "__VIEWSTATEENCRYPTED": "",
     'ctl00$MainContent$physicianSearchView$txtPostalCode': "",
     'ctl00$MainContent$physicianSearchView$rblPractice': "",
     'ctl00$MainContent$physicianSearchView$ddDiscipline': "",
     'ctl00$MainContent$physicianSearchView$rblGender': "",
     'ctl00$MainContent$physicianSearchView$txtPracticeInterests': "",
     'ctl00$MainContent$physicianSearchView$ddApprovals': "",
     'ctl00$MainContent$physicianSearchView$ddLanguage': "",
     "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
     "__EVENTARGUMENT": "",
     'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
     'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
     '__ASYNCPOST': 'true'}

h = {"X-MicrosoftAjax": "Delta = true",
     "X-Requested-With": "XMLHttpRequest",
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"}

urls = []

with requests.session() as s:
    r = s.get("http://search.cpsa.ca/PhysicianSearch", headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    tree = html.fromstring(r.content)
    html.open_in_browser(tree)
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATEGENERATOR"] = vsg
    d["__VIEWSTATE"] = vs
    r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)
    print('opening in browser')
    retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
    print(retrievedUrls)

    for url in retrievedUrls:
        urls.append(url)

    endSearch = False
    while endSearch == False:
        tree = html.fromstring(r.content)
        html.open_in_browser(tree)
        soup = BeautifulSoup(r.content, "lxml")
        print('soup2:')
        ## BREAKS HERE
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        ## BREAKS HERE,
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
        d["ctl00$ctl13"] = "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"
        d["__EVENTVALIDATION"] = ev
        d["__EVENTTARGET"] = ""
        d["__VIEWSTATEGENERATOR"] = vsg
        d["__VIEWSTATE"] = vs
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"] = "Next"
        r = requests.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)
        tree = html.fromstring(r.content)
        retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(urls)
        print(retrievedUrls)
        endSearch = True
...
Traceback (most recent call last):
File "C:\Users\daniel.bak\workspace\Alberta Physician Scraper\main\main.py", line 63, in <module>
ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
IndexError: list index out of range
Well, this nearly drove me mental, but it is finally working. You have to make a GET request to get a new __EVENTVALIDATION token for each POST:
import requests
from bs4 import BeautifulSoup

h = {"X-MicrosoftAjax": "Delta = true",
     "X-Requested-With": "XMLHttpRequest",
     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"}

d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
    'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
    'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
    '__ASYNCPOST': 'true'}

nxt_d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
    "__ASYNCPOST": "true",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

url = "http://search.cpsa.ca/PhysicianSearch"

with requests.session() as s:
    # initial GET to pull the first __EVENTVALIDATION/__VIEWSTATE pair.
    r = s.get(url, headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATE"] = vs
    # POST the search itself.
    r = s.post(url, data=d, headers=h)
    # fresh GET for new tokens before the paging POST.
    soup = BeautifulSoup(s.get("http://search.cpsa.ca/PhysicianSearch").content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    nxt_d["__EVENTVALIDATION"] = ev
    nxt_d["__VIEWSTATE"] = vs
    # POST to move to page 2.
    r = s.post(url, data=nxt_d, headers=h)
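Since that GET-for-tokens step repeats before every POST, it could be factored into a small helper. A sketch reusing the imports and session from the snippet above (fresh_tokens is just an illustrative name):

def fresh_tokens(s, url):
    """GET the search page and return the current
    (__EVENTVALIDATION, __VIEWSTATE) pair; a fresh pair is
    needed before every POST."""
    soup = BeautifulSoup(s.get(url).content, "lxml")
    return (soup.select("#__EVENTVALIDATION")[0]["value"],
            soup.select("#__VIEWSTATE")[0]["value"])

# e.g. nxt_d["__EVENTVALIDATION"], nxt_d["__VIEWSTATE"] = fresh_tokens(s, url)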
If you open the source from the last POST you will see you hit page 2. We need to add more logic to get through all the pages; I will add it in a bit.
The params:
"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
are the page to go to and the page you are coming from, so after a GET those should be all that needs to change. A sketch of that bookkeeping follows.
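Walking forward a page at a time then just means bumping both values each round; this is the bookkeeping only (bump_pager is an illustrative name), with the token refresh and POST staying as above:

def bump_pager(d, to_key, frm_key):
    """Advance both pager values (page to go to, page coming from)
    by one page. Sketch only; assumes both keys already hold
    numeric strings, as in nxt_d above."""
    d[to_key] = str(int(d[to_key]) + 1)
    d[frm_key] = str(int(d[frm_key]) + 1)

# e.g. bump_pager(nxt_d,
#                 "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager",
#                 "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager")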
This will get all the pages, pulling most of the values programmatically. You could probably pull more, especially with the aid of a regex, but it pulls most without hard-coding values:
from lxml.html import fromstring
import requests


class Crawler(object):
    def __init__(self, ua, url):
        self.user_agent = ua
        self.post_header = {"X-MicrosoftAjax": "Delta = true",
                            "X-Requested-With": "XMLHttpRequest",
                            "user-agent": ua}
        self.post_data2 = {'__ASYNCPOST': 'true',
                           "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
        self.url = url
        self.post_data1 = {'__ASYNCPOST': 'true'}

    def populate(self, xml):
        """Pulls form post data keys and values for the initial post."""
        k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
        k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
        self.post_data1[k1.get("name")] = k1.get("value")
        self.post_data1[k2.get("name")] = k2.get("value")
        self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
        self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

    def populate2(self, xml):
        """Pulls form post data keys and values
        for all subsequent posts,
        setting initial page number values.
        """
        data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
        self.pge = data[0]
        self.ev = data[1]
        self.post_data2["__EVENTTARGET"] = self.ev
        self.post_data2[self.ev] = "1"
        self.post_data2[self.pge] = "2"

    @staticmethod
    def put_validation(xml, d):
        """Need to request a new __EVENTVALIDATION token for each post."""
        ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
        vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs

    def next_page(self):
        """Increments both page number values by one per iteration."""
        e = self.post_data2[self.ev]
        v = self.post_data2[self.pge]
        self.post_data2[self.pge] = str(int(v) + 1)
        self.post_data2[self.ev] = str(int(e) + 1)

    def start(self):
        with requests.session() as s:
            # get initial page to pull __EVENTVALIDATION etc..
            req = s.get(self.url, headers={"user-agent": self.user_agent}).content
            xml = fromstring(req)
            # add __EVENTVALIDATION to post data.
            self.put_validation(xml, self.post_data1)
            # populate the rest of the post data.
            self.populate(xml)
            resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
            # yield first page results.
            yield resp
            # fill post data for next pages.
            self.populate2(resp)
            # a non-empty list means the Next button is disabled,
            # i.e. we have hit the last page.
            nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
            while not nxt:
                # update __EVENTVALIDATION token and __VIEWSTATE.
                self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                # post to get the next page of results.
                resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                yield resp
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                self.next_page()
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
url = "http://search.cpsa.ca/PhysicianSearch"
c = Crawler(ua, url)

for tree in c.start():
    # use each page's tree, e.g. pull the profile links from the results grid.
    print(tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href'))