I want to crawl this website. I have written a spider but it is only crawling the front page, i.e. the top 52 items.
I have tried this code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
a = []
from aqaq.items import aqaqItem
import os
import urlparse
import ast

class aqaqspider(BaseSpider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = [
        "http://www.jabong.com/women/clothing/womens-tops/",
    ]

    def parse(self, response):
        # ... Extract items in the page using extractors
        n = 3
        ct = 1
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="page"]')
        for site in sites:
            name = site.select('//div[@id="content"]/div[@class="l-pageWrapper"]/div[@class="l-main"]/div[@class="box box-bgcolor"]/section[@class="box-bd pan mtm"]/ul[@id="productsCatalog"]/li/a/@href').extract()
            print name
            print ct
            ct = ct + 1
            a.append(name)
        req = Request(url="http://www.jabong.com/women/clothing/womens-tops/?page=" + str(n),
                      headers={"Referer": "http://www.jabong.com/women/clothing/womens-tops/",
                               "X-Requested-With": "XMLHttpRequest"},
                      callback=self.parse, dont_filter=True)
        return req  # and your items
It shows the following output:
2013-10-31 09:22:42-0500 [jabong] DEBUG: Crawled (200) <GET http://www.jabong.com/women/clothing/womens-tops/?page=3> (referer: http://www.jabong.com/women/clothing/womens-tops/)
2013-10-31 09:22:42-0500 [jabong] DEBUG: Filtered duplicate request: <GET http://www.jabong.com/women/clothing/womens-tops/?page=3> - no more duplicates will be shown (see DUPEFILTER_CLASS)
2013-10-31 09:22:42-0500 [jabong] INFO: Closing spider (finished)
2013-10-31 09:22:42-0500 [jabong] INFO: Dumping Scrapy stats:
When I put dont_filter=True, it never stops.
Yes, dont_filter has to be used here, since only the page GET parameter changes in the XHR request to http://www.jabong.com/women/clothing/womens-tops/?page=X each time you scroll the page down to the bottom.
Now you need to figure out how to stop crawling. This is actually simple: just check when there are no products on the next page and raise a CloseSpider exception.
Here is a complete code example that works for me (stops at page number 234):
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import BaseSpider
from scrapy.http import Request


class Product(scrapy.Item):
    brand = scrapy.Field()
    title = scrapy.Field()


class aqaqspider(BaseSpider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = [
        "http://www.jabong.com/women/clothing/womens-tops/?page=1",
    ]
    page = 1

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if not products:
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            item['brand'] = product.xpath(".//span[contains(@class, 'qa-brandName')]/text()").extract()[0].strip()
            item['title'] = product.xpath(".//span[contains(@class, 'qa-brandTitle')]/text()").extract()[0].strip()
            yield item

        self.page += 1
        yield Request(url="http://www.jabong.com/women/clothing/womens-tops/?page=%d" % self.page,
                      headers={"Referer": "http://www.jabong.com/women/clothing/womens-tops/",
                               "X-Requested-With": "XMLHttpRequest"},
                      callback=self.parse,
                      dont_filter=True)
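For completeness, here is a minimal sketch of running the spider from a standalone script and exporting the scraped items to a JSON feed. This assumes a Scrapy version whose CrawlerProcess accepts a spider class directly; the items.json output path is arbitrary:

from scrapy.crawler import CrawlerProcess

# Assumes the aqaqspider class above is importable from this script.
process = CrawlerProcess(settings={
    "FEED_URI": "items.json",    # arbitrary output file for the scraped items
    "FEED_FORMAT": "json",
})
process.crawl(aqaqspider)
process.start()  # blocks until the crawl finishes or CloseSpider is raised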
You can try this code, a slight variation of alecxe's code. If there are no products, simply return from the function, which ultimately leads to closing the spider. A simple solution.
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request


class aqaqItem(scrapy.Item):
    brand = scrapy.Field()
    title = scrapy.Field()


class aqaqspider(Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/womens-tops/?page=1"]
    page_index = 1

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                brand = product.xpath(
                    ".//span[contains(@class, 'qa-brandName')]/text()").extract()
                brand = brand[0].strip() if brand else 'N/A'
                title = product.xpath(
                    ".//span[contains(@class, 'qa-brandTitle')]/text()").extract()
                title = title[0].strip() if title else 'N/A'
                item = aqaqItem()
                item['brand'] = brand
                item['title'] = title
                yield item
        else:
            # if no products are available, simply return, which exits
            # parse and ultimately stops the spider
            return

        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/womens-tops/?page=%s" % self.page_index,
                      callback=self.parse)
Even though the spider yields more than 12.5k products, the output contains lots of duplicate entries, so I have made an ITEM_PIPELINE which removes the duplicates and inserts the items into MongoDB.
Pipeline code below:
from pymongo import MongoClient


class JabongPipeline(object):

    def __init__(self):
        self.db = MongoClient().jabong.product

    def isunique(self, data):
        return self.db.find(data).count() == 0

    def process_item(self, item, spider):
        if self.isunique(dict(item)):
            self.db.insert(dict(item))
        return item
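For the pipeline to actually run, it has to be enabled in the project's settings.py. A minimal sketch, assuming the pipeline lives in aqaq/pipelines.py (the module path is an assumption, adjust it to your project layout):

# settings.py
# Enable the deduplication pipeline; the number is its priority order (lower runs first).
ITEM_PIPELINES = {
    'aqaq.pipelines.JabongPipeline': 300,
}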
And here are the Scrapy log stats:
2015-04-19 10:00:58+0530 [jabong] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 426231,
'downloader/request_count': 474,
'downloader/request_method_count/GET': 474,
'downloader/response_bytes': 3954822,
'downloader/response_count': 474,
'downloader/response_status_count/200': 235,
'downloader/response_status_count/301': 237,
'downloader/response_status_count/302': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 4, 19, 4, 30, 58, 710487),
'item_scraped_count': 12100,
'log_count/DEBUG': 12576,
'log_count/INFO': 11,
'request_depth_max': 234,
'response_received_count': 235,
'scheduler/dequeued': 474,
'scheduler/dequeued/memory': 474,
'scheduler/enqueued': 474,
'scheduler/enqueued/memory': 474,
'start_time': datetime.datetime(2015, 4, 19, 4, 26, 17, 867079)}
2015-04-19 10:00:58+0530 [jabong] INFO: Spider closed (finished)