Scraping multiple pages with Scrapy

I am trying to use Scrapy to scrape a website that has several pages of information.

My code is:

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item


class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

I am trying to scrape all the pages until the spider reaches the last one, but sometimes there are more pages than others, so it's hard to say exactly where the page numbers end.

asked May 27 '14 by six7zero9

1 Answer

The idea is to increment pageNumber until no titles are found. If a page has no titles, raise a CloseSpider exception to stop the spider:

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item


URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.page_number = 1

    def parse(self, response):
        # debug output: show which page is currently being parsed
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            # an empty page means we are past the last page of results
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        # request the next page; parsing it will either yield more items
        # or hit the empty-page check above and close the spider
        self.page_number += 1
        yield Request(URL % self.page_number)

This particular spider would go through all 8 pages of the data, then stop.
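
For completeness: if you would rather not rely on counting pages at all, another common pattern is to follow the pager's own "next" link and stop once it disappears. Here is a minimal sketch of that variant; the //a[contains(@class, 'next')] selector is an assumption about the pager markup, so check the actual page before relying on it:

import urlparse

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item


class NextLinkSpider(BaseSpider):
    name = "tcg_next"
    allowed_domains = ["tcgplayer.com"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=1"]

    def parse(self, response):
        sel = Selector(response)
        for title in sel.xpath("//div[@class='magicCard']"):
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            yield item

        # hypothetical selector: follow the pager's "next" link if the page has one
        next_page = sel.xpath("//a[contains(@class, 'next')]/@href").extract()
        if next_page:
            yield Request(urlparse.urljoin(response.url, next_page[0]))

Either spider can be run the usual way, e.g. scrapy crawl tcg -o cards.json, and will stop on its own once the listing runs out.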

Hope that helps.

answered Oct 03 '22 by alecxe