I am using scrapy to scrape a website. I am getting all products from the listing page.Now i want to go to each url of the product but i am not getting the satisfactory result. Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domain = ["test.com"]
start_urls = [
"http://www.test.com/?page=1"
]
page_index = 1
def parse(self,response):
products = response.xpath('//li')
items = []
if products:
for product in products:
item = DmozItem()
item['link'] = product.xpath('@data-url').extract()
item['sku'] = product.xpath('@data-sku').extract()
item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
request = Request(url=page_url,callback=self.parse_page2,
headers={"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
request.meta['item'] = item
item['other'] = request
yield item
else:
return
self.page_index += 1
if self.page_index:
yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
callback=self.parse)
def parse_page2(self, response):
item = response.meta['item']
item['title'] = response.xpath("//span[@id='before_price']/text()")
yield item
The result i am getting is
{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []},
instead of request Get i need the data which i am returning from pars2 function
Where am i going wrong.
Your xpaths seems to be wrong here,
try this
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']
In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']
In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']
In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
so the code will be ,
BASE_URL = 'http://www.jabong.com/'
for product in products:
item = DmozItem()
item_url = product.xpath('./@data-url').extract()
item_url = self.BASE_URL + item_url[0] if item_url else ''
item['link'] = product.xpath('./@data-url').extract()
item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
item['brand'] = product[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
if item_url:
yield Request(url=self.BASE_URL + ,callback=self.parse_page2,
headers={"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8}, meta={'item'=item})
EDIT
complete spider code
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request
class JabongItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
link = scrapy.Field()
sku = scrapy.Field()
brand = scrapy.Field()
img = scrapy.Field()
class JabongSpider(scrapy.Spider):
name = "jabong"
allowed_domains = ["jabong.com"]
start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
page_index = 1
BASE_URL = 'http://www.jabong.com/'
def parse(self, response):
products = response.xpath("//li[@data-url]")
if products:
for product in products:
link = product.xpath('@data-url').extract()
link = self.BASE_URL + link[0] if link else ''
sku = product.xpath('@data-sku').extract()
sku = sku[0].strip() if sku else 'n/a'
brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
brand = brand[0].strip() if brand else 'n/a'
img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
img = img[0].strip() if img else 'n/a'
item = JabongItem()
item['link'] = link
item['sku'] = sku
item['brand'] = brand
item['img'] = img
if link:
yield Request(url=link, callback=self.parse_page2, meta={'item': item})
else:
return
self.page_index += 1
yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1%s" % (self.page_index + 1),
callback=self.parse, dont_filter=True)
def parse_page2(self, response):
item = response.meta['item']
# add whatever extra details you want to item
yield item
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With