I want to:
Currently my spider is like this:
class mySpider(scrapy.Spider):
...
def parse(self, response):
for url in someurls:
yield scrapy.Request(url=url, callback=self.parse_next)
def parse_next(self, response):
for selector in someselectors:
yield { 'contents':...,
...}
nextPage = obtainNextPage()
if nextPage:
yield scrapy.Request(url=next_url, callback=self.parse_next)
The problem is for a set of links that the spider processed, the spider could only reach 'next page' for the last link of that set of links, I viewed that through selenium + chromedriver. For example, I have 10 links(from No.1 to No.10), my spider could only get the next pages for the No.10 link. I don't know if the problem occurred was because of some structural problem of my spider. Below is the full code:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class BaiduSpider(scrapy.Spider):
name = 'baidu'
allowed_domains = ['baidu.com']
start_urls = ['http://tieba.baidu.com']
main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
username = ""
password = ""
def __init__(self, username=username, password=password):
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#options.add_argument('window-size=1200x600')
self.driver = webdriver.Chrome()#chrome_options=options)
self.username = username
self.password = password
# checked
def logIn(self):
elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
elem.click()
wait = WebDriverWait(self.driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,'#TANGRAM__PSP_10__footerULoginBtn')))
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
elem.click()
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
elem.send_keys(self.username)
elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
elem.send_keys(self.password)
self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()
# basic checked
def parse(self, response):
self.driver.get(response.url)
self.logIn()
# wait for hand input verify code
time.sleep(15)
self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
for url in self.driver.find_elements_by_css_selector('a.j_th_tit')[:2]:
#new_url = response.urljoin(url)
new_url = url.get_attribute("href")
yield scrapy.Request(url=new_url, callback=self.parse_next)
# checked
def pageScroll(self, url):
self.driver.get(url)
SCROLL_PAUSE_TIME = 0.5
SCROLL_LENGTH = 1200
page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
scrollPosition = 0
while scrollPosition < page_height:
scrollPosition = scrollPosition + SCROLL_LENGTH
self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
time.sleep(SCROLL_PAUSE_TIME)
time.sleep(1.2)
def parse_next(self, response):
self.log('I visited ' + response.url)
self.pageScroll(response.url)
for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
name = sel.find_element_by_css_selector('.d_name').text
try:
content = sel.find_element_by_css_selector('.j_d_post_content').text
except: content = ''
try: reply = sel.find_element_by_css_selector('ul.j_lzl_m_w').text
except: reply = ''
yield {'name': name, 'content': content, 'reply': reply}
#follow to next page
next_sel = self.driver.find_element_by_link_text("下一页")
next_url_name = next_sel.text
if next_sel and next_url_name == '下一页':
next_url = next_sel.get_attribute('href')
yield scrapy.Request(url=next_url, callback=self.parse_next)
Thanks for your help, and welcome any suggestions referring my code above
In reference to scraping content from one page, store it, and allow the spider to continue the crawl to the scrape and store items on subsequent pages. You should be configuring your items.py file with the item names and pass the items through each scrapy.Request using a meta.
You should check out https://github.com/scrapy/scrapy/issues/1138
To illustrate how this works, it goes something like this... 1. First, we set up the item.py file with the total items to be scraped on every page.
#items.py
import scrapy
class ScrapyProjectItem(scrapy.Item):
page_one_item = scrapy.Field()
page_two_item = scrapy.Field()
page_three_item = scrapy.Field()
Then its importing the items.py item class to you scrapy spider.
from scrapyproject.items import ScrapyProjectItem
The in your scraper, through each page iteration that has content you want, its initializing the items.py class the pass the items using 'meta' to the next request.
#spider.py
def parse(self, response):
# Initializing the item class
item = ScrapyProjectItem()
# Itemizing the... item lol
item['page_one_item'] = response.css("etcetc::").extract() # set desired attribute
# Here we pass the items to the next concurrent request
for url in someurls: # Theres a million ways to skin a cat, dont know your exact use case.
yield scrapy.Request(response.urljoin(url),
callback=self.parse_next, meta={'item': item})
def parse_next(self, response):
# We load the meta from the previous request
item = response.meta['item']
# We itemize
item['page_two_item'] = response.css("etcetc::").extract()
# We pass meta again to next request
for url in someurls:
yield scrapy.Request(response.urljoin(url),
callback=self.parse_again, meta={'item': item})
def parse_again(self, response):
# We load the meta from the previous request
item = response.meta['item']
# We itemize
item['page_three_item'] = response.css("etcetc::").extract()
# We pass meta again to next request
for url in someurls:
yield scrapy.Request(response.urljoin(url),
callback=self.parse_again, meta={'item': item})
# At the end of each iteration of the crawl loop we can yield the result
yield item
As to the problem about crawler only reaching the last link, I would like to have more info instead of guessing what the problem could be. In your "parse_next", you should add a "print(response.url)" to see if the pages are being reached at all? Im sorry if I didnt understand your problem and wasted everyones time lol.
I think I understand better you issue ... You have a list of urls, and each urls has its own set of urls yes?
In your code, the "obtainNextPage()" might be the issue? I have in the past when encountering this type of case have had to use some xpath and/or regex magic to properly obtain the next pages. Im not sure what "obtainNextPage" is doing but... have you thought of parsing the content and use selector to find the next page?? For example.
class mySpider(scrapy.Spider):
...
def parse(self, response):
for url in someurls:
yield scrapy.Request(url=url, callback=self.parse_next)
def parse_next(self, response):
for selector in someselectors:
yield { 'contents':...,
...}
#nextPage = obtainNextPage()
next_page = response.xpath('//path/to/nextbutton/orPage'):
if next_page is not None:
yield scrapy.Request(response.urljoin(next_page),
callback=self.parse_next)
You should still add that "print(response.url)" to see if the url thats being requested is being called correctly, might be urljoin issue.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With