I have read all the threads on using Scrapy for AJAX pages and installed Selenium WebDriver to simplify the task. My spider can partially crawl, but can't get any data into my Items.
My objectives are:
Scrape each item's (post's):
author_name (xpath:/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/text())
author_page_url (xpath:/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/@href)
post_title (xpath://a[@class="title_txt"])
post_page_url (xpath://a[@class="title_txt"]/@href)
post_text (xpath on a separate post page: //div[@id="a_NMContent"]/text())
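For reference, these targets map onto a Scrapy Item like the following (a minimal sketch; the field names just mirror the list above):

import scrapy

class ItalkiItem(scrapy.Item):
    # one field per target listed above
    author_name = scrapy.Field()
    author_page_url = scrapy.Field()
    post_title = scrapy.Field()
    post_page_url = scrapy.Field()
    post_text = scrapy.Field()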
This is my monkey code (I am only taking my first steps in Python, as an aspiring natural language processing student who majored in linguistics):
import scrapy
import time
from selenium import webdriver
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import XPathSelector

class ItalkiSpider(CrawlSpider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']
    # not sure if the rule is set correctly
    rules = (Rule(LxmlLinkExtractor(allow="\entry"), callback="parse_post", follow=True),)

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # adding the necessary search parameters to the URL
        self.driver.get(response.url + "#language=korean&author-language=russian&marks-min=-5&sort=1&page=1")
        # pressing the "Show More" button at the bottom of the search results
        # page to show the next 15 posts; when all results are loaded to the
        # page, the button disappears
        more_btn = self.driver.find_element_by_xpath('//a[@id="a_show_more"]')
        while more_btn:
            more_btn.click()
            # sometimes waiting for 5 sec made the spider close prematurely,
            # so keeping it long in case the server is slow
            time.sleep(10)
        # here is where the problem begins: I am making a list of links to all
        # the posts on the big page, but I am afraid "links" will contain only
        # the first link, because Selenium doesn't do multiple selection as
        # one would expect from this XPath... how can I grab all the links and
        # put them in the "links" list (and should I)?
        links = self.driver.find_elements_by_xpath('/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li/div[2]/a')
        for link in links:
            link.click()
            time.sleep(3)

    # this is the function for parsing individual posts, called back by the
    # *parse* method as specified in the rule of the spider; if it is correct,
    # it should have saved at least one post into an item... I don't really
    # understand how and where this callback function gets the response from
    # the new page (the page of the post in this case)... is it automatically
    # loaded into the driver and then passed on to the callback function as
    # soon as Selenium has clicked on the link (link.click())? Or is it all
    # total nonsense...
    def parse_post(self, response):
        hxs = Selector(response)
        item = ItalkiItem()
        item["post_item"] = hxs.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
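For what it's worth, find_elements_by_xpath (the plural form) already returns every matching element, not just the first one. A minimal, untested sketch of collecting the href attributes and handing them to Scrapy instead of clicking each link in the browser:

# inside parse(), after the "Show More" loop; get_attribute('href')
# returns the absolute URL as resolved by the browser
links = self.driver.find_elements_by_xpath('/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li/div[2]/a')
urls = [link.get_attribute('href') for link in links]
for url in urls:
    yield scrapy.Request(url, callback=self.parse_post)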
Let's think about it a bit differently:
- initialize a scrapy TextResponse with the current page source (with all necessary posts loaded)
- for every post, initialize an Item, yield a Request to the post page, and pass the item instance from the request to the response in the meta dictionary

Notes and changes I'm introducing:
- use a regular Spider class
- close the driver instance in the spider_closed signal dispatcher

The code:
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()


class ItalkiSpider(scrapy.Spider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get('http://www.italki.com/entries/korean')
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.ID, "a_show_more"))
            )
            more_btn.click()

            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=52'):
                break

        # now scrapy should do the job
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        for post in response.xpath('//ul[@id="content"]/li'):
            item = ItalkiItem()
            item['title'] = post.xpath('.//a[@class="title_txt"]/text()').extract()[0]
            item['url'] = post.xpath('.//a[@class="title_txt"]/@href').extract()[0]

            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["text"] = response.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
This is something you should use as a base and improve on to fill out all the other fields, like author or author_url. Hope that helps.
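For instance, a minimal, untested sketch of filling those two fields; the relative XPath below is an unverified guess derived from the absolute XPath in the question, so check it against the live markup:

class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()
    # extra fields for the author data the question asks for
    author = scrapy.Field()
    author_url = scrapy.Field()

# ... and, inside the for-post loop of parse():
item['author'] = post.xpath('.//div[2]/span[2]/ul/li[3]/a/text()').extract()[0]
item['author_url'] = post.xpath('.//div[2]/span[2]/ul/li[3]/a/@href').extract()[0]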