Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Scrapy exception - exceptions.AttributeError: 'unicode' object has no attribute 'select'

Tags:

python

scrapy

I have written a spider, but whenever I run it I get this error:

Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 607, in _tick
    taskObj._oneWorkUnit()
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
    result = next(self._iterator)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
    work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
    yield it.next()
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 28, in process_spider_output
    for x in result:
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/home/vaibhav/scrapyprog/comparison/eScraperInterface/eScraper/spiders/streetstylestoreSpider.py", line 38, in parse
    item['productURL'] = site.select('.//a/@href').extract()
exceptions.AttributeError: 'unicode' object has no attribute 'select'

My code is:

from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------ 

class ESpider(CrawlSpider):
    """Spider for the category pages of streetstylestore.com.

    ``parse`` walks the product list of each start URL and issues one
    follow-up request per product; ``parsePage2`` fills in the remaining
    item fields from the page that request returns.
    """

    name = "streetstylestoreSpider"
    allowed_domains = ["streetstylestore.com"]    

    start_urls = [
                  "http://streetstylestore.com/index.php?id_category=16&controller=category",
                  "http://streetstylestore.com/index.php?id_category=46&controller=category",
                  "http://streetstylestore.com/index.php?id_category=51&controller=category",
                  "http://streetstylestore.com/index.php?id_category=61&controller=category",
                  "http://streetstylestore.com/index.php?id_category=4&controller=category"
                  ]


    def parse(self, response):                  
        """Create one EscraperItem per product-list entry and follow its link.

        For every ``<li>`` under ``ul#product_list`` an item is populated and
        a Request for the item's first product URL is yielded, with the item
        attached as ``request.meta['item']`` and ``parsePage2`` as callback.
        """

        # NOTE(review): `items` is appended to below but never read or
        # returned in this method; only the requests are yielded.
        items = []
        hxs = HtmlXPathSelector(response)        
        # NOTE(review): .extract() converts the matched selectors into unicode
        # strings, so the site.select(...) calls in the loop fail with
        # "AttributeError: 'unicode' object has no attribute 'select'".
        # Keeping the selectors (no .extract() here) avoids that.
        sites = hxs.select('//ul[@id="product_list"]/li').extract()       

        for site in sites:

            item = EscraperItem()        
            item['currency'] = 'INR'
            item['productSite'] = ["http://streetstylestore.com"]
            item['productURL'] = site.select('.//a/@href').extract()            
            item['productImage'] = site.select('.//a/img/@src').extract()                    
            item['productTitle'] = site.select('.//a/@title').extract()            
            # Price text is reduced to what follows the last 'Rs', with
            # thousands separators removed.
            # NOTE(review): both queries run on `hxs` (the whole response),
            # not on `site`, so presumably every item receives the prices of
            # all products on the page - confirm whether `site` was intended.
            productMRP = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//span[@class="old_price"]/text()').extract()]
            productPrice = [i.strip().split('Rs')[-1].replace(',','') for i in hxs.select('.//div[@class="price_container"]//p[@class="price"]/text()').extract()]
            item['productPrice'] = productMRP + productPrice                       

            items.append(item)
            # NOTE(review): raises IndexError if no href was found for this
            # entry, because productURL would then be an empty list.
            secondURL = item['productURL'][0]
            request = Request(secondURL,callback=self.parsePage2)
            # Hand the partially filled item over to parsePage2.
            request.meta['item'] = item
            yield request


    def parsePage2(self, response):
        """Complete the item carried in ``response.meta['item']`` and return it.

        Sets availability, hasVariants, productCategory, productSubCategory,
        productDesc, productImage and image_urls from the response.
        """

        temp = []                
        item = response.meta['item']
        hxs = HtmlXPathSelector(response)

        # Matches the literal 'In Stock ' (note the trailing space) in the
        # text of the details paragraphs.
        availability =  [i for i in hxs.select('//div[@class="details"]/p/text()').extract() if 'In Stock ' in i]

        if  availability:
            item['availability'] = True
        else:
            item['availability'] = False

        # The item has variants when at least one attribute_list div exists.
        hasVariants =  hxs.select('//div[@class="attribute_list"]').extract()

        if hasVariants:            
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False

        # First breadcrumb link text is the category, second the sub-category.
        category = hxs.select('//div[@class="breadcrumb"]/a/text()').extract()
        if category:
            productCategory = [category[0]]
            # NOTE(review): inside `if category:` this condition is always
            # true, so category[1] raises IndexError when there is only one
            # breadcrumb link and the else branch is unreachable; `> 1` looks
            # like the intended test.
            if len(category) >= 1:
                productSubCategory = [category[1]]
            else:
                productSubCategory = ['']
        else:            
            productCategory = ['']
            productSubCategory = ['']

        item['productCategory'] = productCategory       
        item['productSubCategory'] = productSubCategory

        # Thumbnail URLs with "medium" swapped for "large".
        for i in hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract():
            temp.append(i.replace("medium","large"))

        item['productDesc'] =  " ".join([i for i in hxs.select('//div[@id="short_description_content"]/p/text()').extract()])
        # Combine the listing image(s) with thumbnail srcs, thumbnail link
        # targets and the "large" variants collected in temp.
        item['productImage'] = item['productImage'] + hxs.select('//div[@id="thumbs_list"]/ul/li/a/img/@src').extract() + hxs.select('//div[@id="thumbs_list"]/ul/li/a/@href').extract() + temp   
        # De-duplicate through a set; the original ordering is not preserved.
        item['image_urls'] = list(set(item['productImage']))        

        return item

Can someone please tell me what's wrong with my code?

like image 261
Vaibhav Jain Avatar asked Jun 24 '13 04:06

Vaibhav Jain


1 Answer

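Don't call .extract() on what you store in sites. extract() returns the matched nodes as unicode strings, and a string has no .select() method, which is exactly the AttributeError you are seeing. Keep the selector objects so you can run further XPath queries on each one, and call .extract() only on the final values. This...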

sites = hxs.select('//ul[@id="product_list"]/li').extract()

...should be this:

sites = hxs.select('//ul[@id="product_list"]/li')
like image 83
Amber Avatar answered Nov 01 '22 02:11

Amber