Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How can I extract links from a site that uses pagination? (using Selenium)

I want to extract the links from the following site, but it uses pagination. Specifically, I want to extract the link under each "More Info" button:

I'm using the following snippet:

import time
import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import re


# Launch a local Chrome session (requires chromedriver on PATH).
browser = webdriver.Chrome()
# Fixed pause to let the browser finish starting before navigating.
time.sleep(5)
browser.get('https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA')
# Reusable 15-second explicit wait (note: the code below builds its own waits too).
wait = WebDriverWait(browser,15)

def extract_data(browser):
    """Return the href of every "More Info" anchor on the current page."""
    hrefs = []
    for anchor in browser.find_elements_by_xpath("//div[@class='seeMoreBtn']/a"):
        hrefs.append(anchor.get_attribute('href'))
    return hrefs


# Wait until the pagination "next" arrow exists, then read the page counter
# text (expected to look like "1 de 120") to learn how many pages there are.
element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//a[@class='glyphicon glyphicon-chevron-right']")))
# BUG FIX: re.UNICODE is a regex *flag* and belongs in re.search(), not as the
# base argument of int() — re.UNICODE == 32, so int(m.group(1), re.UNICODE)
# would have parsed the page count in base 32.
match = re.search(r'\d+ de (\d+)', element.text, re.UNICODE)
if match is None:
    # The counter did not match (page still rendering, or a different text
    # format) — fall back to a single page instead of crashing with
    # "AttributeError: 'NoneType' object has no attribute 'group'".
    max_pages = 1
else:
    max_pages = int(match.group(1))

# Extract from the current (first) page.
print("Page 1")
print(extract_data(browser))

# Walk the remaining pages by clicking the "next" arrow before each scrape.
for page in range(2, max_pages + 1):
    print("Page %d" % page)
    # .click() returns None, so there is no point assigning its result.
    browser.find_element_by_xpath("//a[@class='glyphicon glyphicon-chevron-right']").click()
    print(extract_data(browser))
    print("-----")

When I run the above script I get the following error (I'm not too familiar with regex yet — just exploring the concept):

Traceback (most recent call last):
  File "E:/Python/CSV/testingtesting.py", line 29, in <module>
    max_pages = int(re.search(r'\d+ de (\d+)', element.text).group(1), re.UNICODE)
AttributeError: 'NoneType' object has no attribute 'group'

Please suggest a solution if possible. I did manage to extract the links by waiting and clicking on the pagination link, but it is slow because I added almost 13 seconds of waits per page. That working code is as follows:

import time
import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import re



# ----------------------------------------------HANDLING-SELENIUM-STUFF-------------------------------------------------
# Accumulator for extracted hrefs (currently unused: rows go straight to CSV).
linkList = []
driver = webdriver.Chrome()
# Fixed pause to let Chrome finish starting before navigating.
time.sleep(5)
driver.get('https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA')
# Reusable 8-second explicit wait (commented out below in favor of sleeps).
wait = WebDriverWait(driver,8)
# Extra fixed pause for the first page of results to render.
time.sleep(7)

# The hard-coded page count (2924 pages) and the fixed sleeps are kept from
# the original script. FIX: open the CSV file and build the writer ONCE,
# outside all loops, instead of re-opening the file for every single link —
# the original paid a file open/close per row written.
with open('test.csv', 'a', encoding='utf-8', newline='') as fp:
    writer = csv.writer(fp, delimiter=',')
    for i in range(1, 2925):
        time.sleep(3)
        # Grab every "More Info" anchor on the current results page.
        links = driver.find_elements_by_xpath("//div[@class='seeMoreBtn']/a")
        time.sleep(3)

        # Append each extracted href as its own CSV row.
        for link in links:
            writer.writerow([link.get_attribute("href")])
        time.sleep(1)
        # Advance to the next page via the chevron-right pagination arrow.
        driver.find_element_by_xpath("//a[@class='glyphicon glyphicon-chevron-right']").click()
        time.sleep(6)
like image 756
GigaByte Avatar asked Oct 28 '22 09:10

GigaByte


1 Answer

Try the code below to get the required data without the extra sleeps:

import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException



# ----------------------------------------------HANDLING-SELENIUM-STUFF-------------------------------------------------
# Collect every "MORE INFO" href, page by page, using explicit waits only —
# no fixed sleeps.
driver = webdriver.Chrome()
driver.get('https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA')
wait = WebDriverWait(driver, 8)

links = []

while True:
    # Wait for this page's result anchors to be visible, then record hrefs.
    anchors = wait.until(EC.visibility_of_all_elements_located((By.LINK_TEXT, "MORE INFO")))
    for anchor in anchors:
        links.append(anchor.get_attribute("href"))

    # Click "Next page" if it is clickable; a TimeoutException means we have
    # reached the last page, so stop paging.
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "li[title='Next page']>a"))).click()
    except TimeoutException:
        break

    # Block until the old anchors go stale, i.e. the next page has replaced
    # the current DOM — this is what removes the need for time.sleep().
    wait.until(EC.staleness_of(anchors[-1]))
#  Do whatever you need with links 
like image 121
Andersson Avatar answered Nov 02 '22 14:11

Andersson