I am new to Scrapy, I had the spider code
class Example_spider(BaseSpider):
name = "example"
allowed_domains = ["www.example.com"]
def start_requests(self):
yield self.make_requests_from_url("http://www.example.com/bookstore/new")
def parse(self, response):
hxs = HtmlXPathSelector(response)
urls = hxs.select('//div[@class="bookListingBookTitle"]/a/@href').extract()
for i in urls:
yield Request(urljoin("http://www.example.com/", i[1:]), callback=self.parse_url)
def parse_url(self, response):
hxs = HtmlXPathSelector(response)
main = hxs.select('//div[@id="bookshelf-bg"]')
items = []
for i in main:
item = Exampleitem()
item['book_name'] = i.select('div[@class="slickwrap full"]/div[@id="bookstore_detail"]/div[@class="book_listing clearfix"]/div[@class="bookstore_right"]/div[@class="title_and_byline"]/p[@class="book_title"]/text()')[0].extract()
item['price'] = i.select('div[@id="book-sidebar-modules"]/div[@class="add_to_cart_wrapper slickshadow"]/div[@class="panes"]/div[@class="pane clearfix"]/div[@class="inner"]/div[@class="add_to_cart 0"]/form/div[@class="line-item"]/div[@class="line-item-price"]/text()').extract()
items.append(item)
return items
And pipeline code is:
class examplePipeline(object):
def __init__(self):
self.dbpool = adbapi.ConnectionPool('MySQLdb',
db='blurb',
user='root',
passwd='redhat',
cursorclass=MySQLdb.cursors.DictCursor,
charset='utf8',
use_unicode=True
)
def process_item(self, spider, item):
# run db query in thread pool
assert isinstance(item, Exampleitem)
query = self.dbpool.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item
def _conditional_insert(self, tx, item):
print "db connected-=========>"
# create record if doesn't exist.
tx.execute("select * from example_book_store where book_name = %s", (item['book_name']) )
result = tx.fetchone()
if result:
log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
else:
tx.execute("""INSERT INTO example_book_store (book_name,price)
VALUES (%s,%s)""",
(item['book_name'],item['price'])
)
log.msg("Item stored in db: %s" % item, level=log.DEBUG)
def handle_error(self, e):
log.err(e)
After running this I am getting the following error
exceptions.NameError: global name 'Exampleitem' is not defined
I got the above error when I added the below code in process_item
method
assert isinstance(item, Exampleitem)
and without adding this line I am getting
**exceptions.TypeError: 'Example_spider' object is not subscriptable
Can anyone make this code run and make sure that all the items saved into database?
Try the following code in your pipeline
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MySQLStorePipeline(object):
def __init__(self):
self.conn = MySQLdb.connect('host', 'user', 'passwd',
'dbname', charset="utf8",
use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO example_book_store (book_name, price)
VALUES (%s, %s)""",
(item['book_name'].encode('utf-8'),
item['price'].encode('utf-8')))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
Your process_item method should be declared as: def process_item(self, item, spider):
instead of def process_item(self, spider, item):
-> you switched the arguments around.
This exception: exceptions.NameError: global name 'Exampleitem' is not defined
indicates you didn't import the Exampleitem in your pipeline.
Try adding: from myspiders.myitems import Exampleitem
(with correct names/paths ofcourse).
I think this way is better and more concise:
#Item
class pictureItem(scrapy.Item):
topic_id=scrapy.Field()
url=scrapy.Field()
#SQL
self.save_picture="insert into picture(`url`,`id`) values(%(url)s,%(id)s);"
#usage
cur.execute(self.save_picture,dict(item))
It's just like
cur.execute("insert into picture(`url`,`id`) values(%(url)s,%(id)s)" % {"url":someurl,"id":1})
Cause (you can read more about Items in Scrapy)
The Field class is just an alias to the built-in dict class and doesn’t provide any extra functionality or attributes. In other words, Field objects are plain-old Python dicts.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With