Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Writing items to a MySQL database in Scrapy

I am new to Scrapy. I have the following spider code:

# Spider that requests the /bookstore/new listing page, follows every
# book-title link found on it, and builds Exampleitem objects from each
# linked page.
class Example_spider(BaseSpider):
   name = "example"
   allowed_domains = ["www.example.com"]

   def start_requests(self):
       # Single seed request: the /bookstore/new listing page.
       yield self.make_requests_from_url("http://www.example.com/bookstore/new")

   def parse(self, response):
       # Collect the href of every link inside a "bookListingBookTitle" div.
       hxs = HtmlXPathSelector(response)
       urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
       for i in urls:
           # i[1:] drops the first character of the href (presumably a
           # leading "/" -- confirm) before joining it onto the site root.
           # Each resulting page is handled by parse_url.
           yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)

   def parse_url(self, response):
           # Build one Exampleitem per div with id "bookshelf-bg" and return
           # them as a list.
           hxs = HtmlXPathSelector(response)
           main =   hxs.select('//div[@id="bookshelf-bg"]')
           items = []
           # NOTE(review): the statements after this `for` are at the same
           # indentation as the `for` itself, and `return items` is at yet
           # another level -- the indentation was likely lost in the paste.
           for i in main:
           item = Exampleitem()
           # book_name takes the first matching text node ([0].extract()).
           item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
           # price keeps the whole extract() result (no [0]), unlike book_name.
           item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
           items.append(item)
       return items

And the pipeline code is:

# Item pipeline that stores scraped items in the MySQL table
# example_book_store through an adbapi connection pool.
class examplePipeline(object):

    def __init__(self):               
        # Pool of MySQLdb connections to the 'blurb' database; the
        # connection settings are hard-coded here.
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                db='blurb',
                user='root',
                passwd='redhat',
                cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8',
                use_unicode=True
            )
# NOTE(review): this def and the ones below start at column 0, i.e. outside
# the class body -- the indentation was likely lost in the paste.
# Schedules _conditional_insert for the item on the pool, attaches
# handle_error as the errback, and returns the item.
def process_item(self, spider, item):
    # run db query in thread pool
    assert isinstance(item, Exampleitem)
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item
# Inserts the item only when no row with the same book_name exists yet;
# `tx` is the object the queries are executed on.
def _conditional_insert(self, tx, item):
    print "db connected-=========>"
    # create record if doesn't exist. 
    # Note: (item['book_name']) has no trailing comma, so the value itself is
    # passed as the parameters argument rather than a 1-tuple.
    tx.execute("select * from example_book_store where book_name = %s", (item['book_name']) )
    result = tx.fetchone()
    if result:
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
    else:
        tx.execute("""INSERT INTO example_book_store (book_name,price)
                    VALUES (%s,%s)""",   
                            (item['book_name'],item['price'])
                    )
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)            

# Errback attached in process_item: logs the error it receives.
def handle_error(self, e):
    log.err(e)          

After running this I am getting the following error

exceptions.NameError: global name 'Exampleitem' is not defined

I got the above error when I added the code below to the process_item method

assert isinstance(item, Exampleitem)

and without adding this line I am getting

exceptions.TypeError: 'Example_spider' object is not subscriptable

Can anyone make this code run and make sure that all the items are saved into the database?

like image 782
Shiva Krishna Bavandla Avatar asked Jun 01 '12 07:06

Shiva Krishna Bavandla


3 Answers

Try the following code in your pipeline

import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MySQLStorePipeline(object):
    """Item pipeline that inserts every scraped item into MySQL.

    One connection and one cursor are opened when the pipeline is created
    and reused for all items; each item is committed on its own.
    """

    def __init__(self):
        # Replace the placeholder strings with the real host, credentials
        # and database name.
        self.conn = MySQLdb.connect('host', 'user', 'passwd', 
                                    'dbname', charset="utf8",
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``example_book_store`` and return it.

        Note the argument order: the item comes first, then the spider.
        A MySQL error is rolled back and reported, and the item is still
        returned so that later pipeline stages keep running.
        """
        try:
            self.cursor.execute("""INSERT INTO example_book_store (book_name, price)  
                        VALUES (%s, %s)""", 
                       (item['book_name'].encode('utf-8'), 
                        item['price'].encode('utf-8')))
            self.conn.commit()
        except MySQLdb.Error as e:
            # Undo the failed statement so the shared connection is clean
            # for the next item.
            self.conn.rollback()
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
like image 88
Mahmoud M. Abdel-Fattah Avatar answered Oct 07 '22 11:10

Mahmoud M. Abdel-Fattah


Your process_item method should be declared as: def process_item(self, item, spider): instead of def process_item(self, spider, item): -> you switched the arguments around.

This exception: exceptions.NameError: global name 'Exampleitem' is not defined indicates you didn't import the Exampleitem in your pipeline. Try adding: from myspiders.myitems import Exampleitem (with correct names/paths of course).

like image 31
Sjaak Trekhaak Avatar answered Oct 07 '22 13:10

Sjaak Trekhaak


I think this way is better and more concise:

# Item
class pictureItem(scrapy.Item):
    # The field names double as the named placeholders in the SQL below.
    topic_id = scrapy.Field()
    url = scrapy.Field()

# SQL
# The named placeholders must match the item's field names, because the item
# is handed to execute() as a plain dict; `topic_id` is written to column `id`.
self.save_picture = "insert into picture(`url`,`id`) values(%(url)s,%(topic_id)s);"

# usage
# The values are passed separately from the SQL text, so the driver does the
# substitution and escaping.
cur.execute(self.save_picture, dict(item))

It's just like

# Pass the values as the second argument; formatting them into the SQL
# string with % would insert them unquoted and unescaped.
cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)", {"url":someurl,"id":1})

This works because (you can read more about Items in Scrapy):

The Field class is just an alias to the built-in dict class and doesn’t provide any extra functionality or attributes. In other words, Field objects are plain-old Python dicts.

like image 1
FavorMylikes Avatar answered Oct 07 '22 13:10

FavorMylikes