I'm learning Python by trying to write a script to scrape xHamster. If anyone's familiar with the site, I'm trying to specifically write all URLs of a given user's videos to a .txt file.
Currently, I've managed to scrape the URLs off a specific page, however there are multiple pages and I'm struggling to loop through the number of pages.
In my attempt below I've commented where I'm trying to read the URL of the next page; however, it currently prints None. Any ideas why, and how to resolve this?
Current script:
#!/usr/bin/env python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Collect the video links shown on the first listing page of one user and
# save them, one URL per line, to 'x--<username>.txt'.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(chrome_options=chrome_options)
username = "ANY_USERNAME"  # placeholder: replace with the account to scrape
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

try:
    # Element lookups may wait up to 10 seconds for a match before failing.
    driver.implicitly_wait(10)
    driver.get(url)
    # Read the href values while the page is still loaded; the element
    # handles cannot be used once the browser session has ended.
    hrefs = [
        link.get_attribute('href')
        for link in driver.find_elements_by_class_name('hRotator')
    ]
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even when the page load or the lookup raised.
    driver.quit()

# The context manager closes the file even if a write fails.
with open('x--' + username + '.txt', 'w') as out:
    for href in hrefs:
        # get_attribute() gives None for an element without an href;
        # skip those rather than fail on None + '\n'.
        if href:
            out.write(href + '\n')
My attempt at looping through the pages:
#!/usr/bin/env python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Same as the single-page script, plus an attempt to read the link to the
# next/last listing page so that the remaining pages can be visited too.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
driver = webdriver.Chrome(chrome_options=chrome_options)
username = "ANY_USERNAME"  # placeholder: replace with the account to scrape
url = "https://xhams***.com/user/video/" + username + "/new-1.html"

try:
    # Element lookups may wait up to 10 seconds for a match before failing.
    driver.implicitly_wait(10)
    driver.get(url)
    links = driver.find_elements_by_class_name('hRotator')

    # NOTE(review): this is the lookup that printed None. get_attribute()
    # returns None when the matched element does not carry the attribute, so
    # the element with class 'last' is presumably a wrapper around the <a>
    # rather than the <a> itself - confirm against the page markup.
    print(driver.find_element_by_class_name('last').get_attribute('href'))

    # Read the href values while the page is still loaded; the element
    # handles cannot be used once the browser session has ended.
    hrefs = [link.get_attribute('href') for link in links]
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even when the page load or a lookup raised.
    driver.quit()

# The context manager closes the file even if a write fails.
with open('x--' + username + '.txt', 'w') as out:
    for href in hrefs:
        # Skip elements without an href rather than fail on None + '\n'.
        if href:
            out.write(href + '\n')
UPDATE:
I've used Philippe Oger's answer below but modified the two methods below to work for single page results:
def find_max_pagination(self):
    """Fetch the user's first listing page and record its highest page number.

    Sets ``self.max_page`` to the largest integer found among the pager
    links, or to 1 when the page has no pager or no numeric pager link
    (a user whose videos fit on a single page).

    Returns:
        The value stored in ``self.max_page``.
    """
    start_url = 'https://www.xhamster.com/user/video/{}/new-1.html'.format(self.user)
    r = requests.get(start_url)
    tree = html.fromstring(r.content)

    page_numbers = []
    # Evaluate the pager XPath once and keep only the numeric labels.
    for anchor in tree.xpath('//div[@class="pager"]/table/tr/td/div/a'):
        try:
            page_numbers.append(int(anchor.text))
        except (TypeError, ValueError):
            # Text is None or not a number (e.g. '...'): not a page link.
            continue

    # max() raises on an empty list, so fall back to a single page explicitly.
    self.max_page = max(page_numbers) if page_numbers else 1
    return self.max_page
def generate_listing_urls(self):
    """Return the listing-page URL for every page from 1 to ``self.max_page``.

    Listing pages are numbered from 1 (the first page is ``new-1.html``), so
    the range runs from 1 through ``self.max_page`` inclusive. A single-page
    user (``max_page == 1``) needs no special case: the range yields just 1.

    Returns:
        A list with one URL per listing page, in page order.
    """
    return [
        self.paginated_listing_page(str(page))
        for page in range(1, self.max_page + 1)
    ]
On a user page we can actually find out how far the pagination goes, so instead of looping through the pagination, we can generate each listing URL for the user with a list comprehension and then scrape those one by one.
Here are my two cents using LXML. If you simply copy/paste this code, it will write every video URL to a TXT file. You only need to change the user name.
from lxml import html
import requests
class XXXVideosScraper(object):
    """Collect every video URL listed on a user's paginated video pages.

    Typical use: ``XXXVideosScraper('some_user').run()``, which writes one
    URL per line to ``results.txt`` in the current working directory.
    """

    # Placeholders: user name, then page number.
    LISTING_URL = 'https://www.xhamster.com/user/video/{}/new-{}.html'
    # Anchors inside the pager; their text is a page number, '...' or empty.
    PAGER_LINKS_XPATH = '//div[@class="pager"]/table/tr/td/div/a'
    # href of every video thumbnail link on a listing page.
    VIDEO_LINKS_XPATH = '//a[@class="hRotator"]/@href'

    def __init__(self, user):
        """Remember the user name and start with no pages or videos known."""
        self.user = user
        self.max_page = None  # set by find_max_pagination()
        self.video_urls = list()  # filled by capture_video_urls()

    def run(self, output_path='results.txt'):
        """Scrape every listing page and write the video URLs to a file.

        Args:
            output_path: File to (over)write with one URL per line.
                Defaults to 'results.txt'.
        """
        self.find_max_pagination()
        for page in self.generate_listing_urls():
            self.capture_video_urls(page)
        with open(output_path, 'w') as f:
            f.writelines(video + '\n' for video in self.video_urls)

    @staticmethod
    def _highest_page_number(labels):
        """Return the largest integer among ``labels``, or 1 if there is none.

        Labels that are None or not numeric (such as '...') are ignored
        individually, so one odd pager link cannot hide the real page count.
        """
        numbers = []
        for label in labels:
            try:
                numbers.append(int(label))
            except (TypeError, ValueError):
                continue
        return max(numbers) if numbers else 1

    def find_max_pagination(self):
        """Fetch the first listing page and record its highest page number.

        Sets ``self.max_page`` to the largest number shown in the pager, or
        to 1 when the page has no pager or no numeric pager link.

        Returns:
            The value stored in ``self.max_page``.
        """
        r = requests.get(self.paginated_listing_page(1))
        tree = html.fromstring(r.content)
        pager_links = tree.xpath(self.PAGER_LINKS_XPATH)
        self.max_page = self._highest_page_number(a.text for a in pager_links)
        return self.max_page

    def generate_listing_urls(self):
        """Return the listing URLs for pages 1 through ``self.max_page``."""
        return [
            self.paginated_listing_page(page)
            for page in range(1, self.max_page + 1)
        ]

    def paginated_listing_page(self, pagination):
        """Return the listing URL of page number ``pagination`` for the user."""
        return self.LISTING_URL.format(self.user, str(pagination))

    def capture_video_urls(self, url):
        """Fetch one listing page and append its video links to ``video_urls``."""
        r = requests.get(url)
        tree = html.fromstring(r.content)
        self.video_urls += tree.xpath(self.VIDEO_LINKS_XPATH)
if __name__ == '__main__':
    # Script entry point: scrape the sample account and write its video
    # URLs to the scraper's default output file.
    XXXVideosScraper('wearehairy').run()
I have not checked the case where a user has only 1 page in total. Let me know if this works fine.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With