Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why can't I loop through a `payload` in `requests` to iterate my web scrape?

Summary: I want to iterate through a requests payload, so that I can change the log-in ID number for each scrape.

I'm using requests & Beautiful Soup to do a web scrape. To log in to the page, I need to enter a unique ID number; I have a list of such numbers, called hit_list.

For any given ID number, this script works absolutely fine. But what I want to do is automate it so that it runs through my entire hit_list.

In other words, I want num in payload_1 to change for each iteration. At present num remains constant and the scrape just iterates according to the length of hit_list (i.e. in this case the same scrape would run five times)

Please note, I'm very new to coding and this is my first project. I'm aware there are likely to be problems with it and am happy to receive constructive criticism.

Importing Libraries
import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl


# Record the start time so the total run time can be reported at the end of the script.
startTime = datetime.now()
# Prints a second, slightly later timestamp (a fresh now() call), not startTime itself.
print(datetime.now())

# The pymysql database/connection setup is omitted here for brevity.

# This is a sample list of ID numbers; in reality the list will have 100,000+ numbers.
hit_list = [100100403,100100965,100101047,100100874,100100783]

# The bare string below is disabled code (never executed): it shows how the
# real list is loaded from a spreadsheet with openpyxl.
"""
This is my code for importing the real list, included here incase the way the list is imported is relevant to the problem
wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx')
sheet= wb.get_sheet_by_name('Sheet1')
type(wb)
#LOUIS: Only importing first twenty (for trial purposes)
for id in range(1,20):
   hit_list.append(sheet.cell(row=id, column =1).value)
"""

def _readonly_text(section, container_id):
    """Return the stripped text of the read-only value inside a form container.

    Finds the <span> with id *container_id* inside *section*, then the <div>
    with class ``formControlReadonly`` inside that span.
    """
    container = section.find('span', {'id': container_id})
    value_div = container.find('div', {'class': 'formControlReadonly'})
    return value_div.get_text().strip()


def web_scrape():
    """Scrape the details page for every number in ``hit_list`` and insert one row per number.

    For each number, two POSTs are sent on a shared requests session: the
    search form, then the ``ViewDetails`` postback of the intermediate page.
    The response is parsed with BeautifulSoup, each field is printed, and the
    values are inserted into ``table1`` through the module-level
    ``connection`` (its setup is omitted from this listing).
    """
    # I'm only creating a function because I'm told it's always good practice
    # to put any 'bit' of logic into a function.
    # Open page
    url = 'https://ndber.seai.ie/pass/ber/search.aspx'

    with requests.session() as r:
        r.headers.update({
            'user-agent': 'For more information on this data collection please contact **************************************'
        })

        for num in hit_list:
            # ***LOCATION OF THE PROBLEM***
            # NOTE(review): the __VIEWSTATE / __EVENTVALIDATION values in both
            # payloads are fixed literals; they are not refreshed per iteration.
            payload_1 = {
                'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
                'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
                '__VIEWSTATE': '/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=',
            }
            r.post(url, data=payload_1)
            # click intermediate page
            payload_2 = {
                '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
                '__VIEWSTATE': "/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=",
                '__VIEWSTATEGENERATOR': "1F9CCB97",
                '__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==",
            }
            s = r.post(url, data=payload_2)
            # scrape the page
            soup = bs4.BeautifulSoup(s.content, 'html.parser')

            # FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED

            print('\nBEGINNING SCRAPE....')
            # First Section
            ber_dec = soup.find('fieldset', {'id': 'ctl00_DefaultContent_BERSearch_fsBER'})

            # Address: child texts are joined with ',' before stripping
            address = ber_dec.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
            address = address.get_text(',').strip()
            print('address:', address)

            # Date of Issue
            date_issue = _readonly_text(ber_dec, 'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue')
            print('date_of_issue:', date_issue)

            # MPRN
            MPRN = _readonly_text(ber_dec, 'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN')
            print('MPRN:', MPRN)

            # Emissions Indicator: text splits into "<value> (<unit>)"
            emissions_indicator1 = ber_dec.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'})
            emissions_indicator_bunched = emissions_indicator1.get_text().strip()
            print('\n\nem_bunched:', emissions_indicator_bunched)
            emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split()
            print('emissions_indicator:', emissions_indicator)
            emissions_indicator_unit = emissions_indicator_unit.replace("(", "")
            emissions_indicator_unit = emissions_indicator_unit.replace(")", "")
            print('emissions_indicator_unit:', emissions_indicator_unit)

            # BER Score: text splits into three whitespace-separated parts
            BER_bunched = ber_dec.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'})
            BER_bunched = BER_bunched.get_text().strip()
            print('\n \nBER_bunched:', BER_bunched)
            BER_score, BER_actual_rating, BER_unit = BER_bunched.split()
            print('\nBER_score:', BER_score)
            print('\nBER_actual_rating:', BER_actual_rating)
            # NOTE(review): "(" is replaced with a space here (unlike the other
            # unit fields), so BER_unit keeps a leading space - confirm intended.
            BER_unit = BER_unit.replace("(", " ")
            BER_unit = BER_unit.replace(")", "")
            print('\nClean_BER_unit:', BER_unit)

            # Type of Rating
            type_of_rating = _readonly_text(ber_dec, 'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating')
            print('type_of_rating:', type_of_rating)

            # Second Section
            dwelling_details = soup.find('fieldset', {'id': 'ctl00_DefaultContent_BERSearch_fsStructure'})

            # Dwelling Type
            dwelling_type = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType')
            print('Dwelling Type:', dwelling_type)

            # Number of Stories
            num_stories = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy')
            print('Number of Stories:', num_stories)

            # Year of Construction
            yr_construction = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction')
            print('Year of Construction:', yr_construction)

            # Floor Area: text splits into "<value> (<unit>)"
            floor_area = dwelling_details.find('div', {'id': 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'})
            floor_area = floor_area.get_text().strip()
            floor_area, floor_area_unit = floor_area.split()
            floor_area_unit = floor_area_unit.replace("(", "")
            floor_area_unit = floor_area_unit.replace(")", "")
            print('\nFloor Area:', floor_area)
            print('floor_area_unit:', floor_area_unit)

            # Wall Type
            wall_type = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType')
            print('Wall Type:', wall_type)

            # Glazing Type
            glazing_type = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType')
            print('Glazing Type:', glazing_type)

            # Percent Low Energy Lighting
            percent_low_energy_lighting = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight')
            print('% Low Energy Lighting:', percent_low_energy_lighting)

            # Space Heating Fuel
            space_heating_fuel = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel')
            print('Space Heating Fuel:', space_heating_fuel)

            # Space Heating Efficiency
            space_heating_efficiency = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency')
            print('Space Heating Efficiency:', space_heating_efficiency)

            # Water Heating Fuel
            water_heating_fuel = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel')
            print('Water Heating Fuel:', water_heating_fuel)

            # Water Heating Efficiency
            water_heating_efficiency = _readonly_text(dwelling_details, 'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency')
            print('Water Heating Efficiency:', water_heating_efficiency)

            # Third section
            assessor_details = soup.find('fieldset', {'id': 'ctl00_DefaultContent_BERSearch_fsAssessor'})

            # Assessor Number
            assessor_num = _readonly_text(assessor_details, 'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber')
            print('Assessor Number:', assessor_num)

            print('BER:', num)

            # Fixed escape typo: the original literal began with "\***...n"
            # instead of a newline followed by the asterisks.
            print('\n***************SCRAPE FINISHED***************\n')

            # Populate database
            print('\nRECONNECTING WITH DATABASE')
            with connection.cursor() as cursor:
                print('SUCCESSFUL CONNECTION')
                # Values are passed separately from the SQL text so the driver escapes them.
                sql = "INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating))
                print('ROW POPULATED')

# Run the scrape over every number in hit_list.
web_scrape()

# Report how long the whole run took, measured from startTime recorded at the top of the script.
print('Gathering Details...')
Run_time = datetime.now() - startTime
print('Run Time:', Run_time)

# Scrape loop finished

print('\n***************PROGRAMME FINISHED***************')
like image 723
SeánMcK Avatar asked Jan 28 '26 20:01

SeánMcK


2 Answers

You need to get fresh __VIEWSTATE, __EVENTVALIDATION, etc. tokens for each post; you cannot just copy values from your browser and hard-code them into your post data:

import requests
from bs4 import BeautifulSoup

url = 'https://ndber.seai.ie/pass/ber/search.aspx'
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]
h = {}


def renew(s):
    """Fetch the search page and return its current postback token values.

    Args:
        s: object with a ``get(url)`` method whose response exposes
            ``.content`` (here the ``requests`` session).

    Returns:
        dict mapping ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION`` to the ``value`` attribute of the element with
        that id in the fetched page.
    """
    # The parser name was an unterminated literal ("html.parser.) in the
    # original listing, which is a SyntaxError.
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}


with requests.session() as s:
    for num in hit_list:
        # Search-form fields first, then the token values fetched fresh for
        # this request are merged in.
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
            **renew(s),
        }
        r = s.post(url, data=payload_1)

        # Parse the returned page so it can be scraped.
        soup = BeautifulSoup(r.content, 'html.parser')

If we run the code and parse a bit of what is returned, you can see we get each page correctly:

In [8]: with requests.session() as s:
   ...:         for num in hit_list:
   ...:                 payload_1 = {
   ...:                     'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': str(num),
   ...:                     'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
   ...:                 payload_1.update(renew(s))
   ...:                 r = s.post(url, data=payload_1)
   ...:                 soup = BeautifulSoup(r.content, 'html.parser')
   ...:                 spans = soup.select("#ctl00_DefaultContent_BERSearch_gridRatings_gridview tr.GridRowStyle td span")
   ...:                 print(spans)
   ...:         
[<span>BER</span>, <span>10003467711</span>, <span>07-01-2009</span>, <span>24 CLONEE COURT\rMAIN STREET\rCLONEE\rCO. MEATH</span>]
[<span>BER</span>, <span>10301654014</span>, <span>26-11-2014</span>, <span>19 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10002082335</span>, <span>08-01-2009</span>, <span>8 CANNON PLACE\r1 HERBERT ROAD\rDUBLIN 4</span>]
[<span>BER</span>, <span>10301653940</span>, <span>18-01-2015</span>, <span>12 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10010500405</span>, <span>07-01-2009</span>, <span>13 RENMORE ROAD\rGALWAY CITY</span>]

That gives you all the info from the table bar the BER cert number; you already have that, so you don't need to worry about it.

As you figured out, you just need to pass the data returned from the first post into your second payload. If you encapsulate the logic in functions, it will also make your code a bit easier to manage:

def renew(soup):
    """Collect the three postback token values from an already-parsed page.

    Args:
        soup: parsed page exposing ``select_one(css_selector)``.

    Returns:
        dict keyed by ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION``, each holding the ``value`` of the element with
        that id.
    """
    token_ids = ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")
    return {token_id: soup.select_one("#" + token_id)["value"]
            for token_id in token_ids}


def parse_data(soup):
    """Extract a subset of the details-page fields into a dict.

    Args:
        soup: parsed details page exposing ``select_one(css_selector)``; the
            selected elements are read through their ``.text``.

    Returns:
        dict with keys ``MPRN``, ``emissions_indicator``,
        ``emissions_indicator_unit``, ``BER_score``, ``BER_actual_rating``,
        ``BER_unit`` and ``address`` (in that order).

    Raises:
        ValueError: if the emissions text does not split into exactly two
            whitespace-separated parts, or the rating text into exactly three.
    """
    def text_of(selector):
        # Raw text of the first element matching the selector.
        return soup.select_one(selector).text

    address = text_of("#ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress").strip()
    mprn = text_of("#ctl00_DefaultContent_BERSearch_dfBER_container_MPRN div.formControlReadonly").strip()

    # "<value> (<unit>)": two whitespace-separated parts.
    emissions_value, emissions_unit = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue").split()

    # Three whitespace-separated parts, the last one wrapped in parentheses.
    score, actual_rating, rating_unit = text_of(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating").split()

    record = {}
    record["MPRN"] = mprn
    record["emissions_indicator"] = emissions_value
    record["emissions_indicator_unit"] = emissions_unit.strip("()")
    record["BER_score"] = score
    record["BER_actual_rating"] = actual_rating
    record["BER_unit"] = rating_unit.strip("()")
    record["address"] = address
    return record

def submint_to_db(dct):
    """Insert one row into ``table1`` using the keys of *dct* as column names.

    Args:
        dct: mapping of column name -> value. The column names are
            interpolated directly into the SQL text, so they must come from
            trusted code (here, the fixed keys built by ``parse_data``); the
            values are passed as query parameters.

    Uses the module-level ``connection``, which is defined outside this
    snippet.
    """
    with connection.cursor() as cursor:
        print('SUCCESSFUL CONNECTION')
        sql = "INSERT INTO table1 ( %s ) VALUES ( %s )" % (",".join(dct), ', '.join(['%s'] * len(dct)))
        # On Python 3, dict.values() is a view object, not a list; the driver
        # expects a tuple, list or dict of parameters, so materialise it.
        cursor.execute(sql, list(dct.values()))
        # NOTE(review): no commit is issued here - confirm the connection is
        # in autocommit mode or is committed elsewhere.

# Form fields that are the same for every number; the number itself and the
# token values are filled in inside the loop.
payload_1 = {
    'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
payload_2 = {
    '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
}

with requests.session() as s:
    # Every request goes through the session `s` (the original mixed in
    # module-level requests.get / requests.post), so cookies set by one
    # response are sent with the following requests.
    tokens = renew(BeautifulSoup(s.get(url).content, "html.parser"))
    for num in hit_list:
        # update the post data with new token values
        payload_1['ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber'] = num
        payload_1.update(tokens)
        r = s.post(url, data=payload_1)
        # The search response carries the tokens needed for the "view details" postback.
        tokens2 = renew(BeautifulSoup(r.content, 'html.parser'))
        payload_2.update(tokens2)
        soup = BeautifulSoup(s.post(url, data=payload_2).content, "html.parser")
        submint_to_db(parse_data(soup))

I have not parsed all the data, but the logic is the same for the rest. Printing the dicts returned for what is parsed will give you:

{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '57.83', 'address': '24 CLONEE COURTMAIN STREETCLONEECO. MEATH', 'BER_score': 'D1', 'BER_actual_rating': '235.54', 'MPRN': '10003467711'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '42.4', 'address': '19 GORTANORADINGLECO. KERRY', 'BER_score': 'C1', 'BER_actual_rating': '165.79', 'MPRN': '10301654014'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '34.03', 'address': '8 CANNON PLACE1 HERBERT ROADDUBLIN 4', 'BER_score': 'C2', 'BER_actual_rating': '175.32', 'MPRN': '10002082335'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '53.51', 'address': '12 GORTANORADINGLECO. KERRY', 'BER_score': 'C3', 'BER_actual_rating': '208.45', 'MPRN': '10301653940'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '121.54', 'address': '13 RENMORE ROADGALWAY CITY', 'BER_score': 'G', 'BER_actual_rating': '472.19', 'MPRN': '10010500405'}
like image 73
Padraic Cunningham Avatar answered Jan 30 '26 10:01

Padraic Cunningham


@PadraicCunningham provided most of the logic for this answer, but as my comment below his answer describes, his solution only gets me halfway.
I have been able to build on his work to solve the problem.
There was just one more step to complete, which was to 'click through' an intermediary page, which leads to where the data I wanted to scrape lies.

Apologies in advance for my non-standard labelling and formatting. I'm a beginner.

import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl

hit_list = [100100403,100100965,100101047,100100874,100100783] #this is a sample list
#Open page 
url = 'https://ndber.seai.ie/pass/ber/search.aspx'


def field_update(s):
    """Fetch the search page and return its current postback token values.

    Args:
        s: object with a ``get(url)`` method whose response exposes
            ``.content`` (here the ``requests`` session).

    Returns:
        dict mapping ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
        ``__EVENTVALIDATION`` to the ``value`` attribute of the element with
        that id in the fetched page.
    """
    soup = bs4.BeautifulSoup(s.get(url).content, "html.parser")
    tokens = {
        "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
        "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
        "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"],
    }
    # This progress message originally sat after the return statement, where
    # it could never execute.
    print('field updated')
    return tokens

with requests.session() as s:
    # The whole request/parse sequence is inside the loop so it runs once per
    # number; in the original listing everything after payload_1 was dedented
    # out of the loop, so only the last number was ever posted.
    for ber in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': ber,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(field_update(s))
        r = s.post(url, data=payload_1)

        # 'click through' intermediate page
        # THIS IS THE ADDITIONAL CODE THAT BUILDS ON PADRAIC'S ANSWER
        # The tokens for this second post are read from the search response.
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        stage_two = {
            "__EVENTTARGET": 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
            "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

        q = s.post(url, data=stage_two)
        print('payload_2 posted')
        soup = bs4.BeautifulSoup(q.content, 'html.parser')

        print('\nBEGINNING SCRAPE....')
        # FOR DATA TO BE SCRAPED, SEE ORIGINAL QUESTION
    #FOR DATA TO BE SCRAPED, SEE ORIGINAL QUESTION
like image 32
SeánMcK Avatar answered Jan 30 '26 12:01

SeánMcK



Donate For Us

If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!