I get a Scrapy example from a website,it works but seems something wrong:it can not get all the content,and I don't know what happened. The example uses Scrapy+Redis+MongoDB.
the info:
2015-10-09 01:43:33 [scrapy] INFO: Crawled 292 pages (at 292 pages/min), scraped 291 items (at 291 items/min)
2015-10-09 01:44:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:45:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:46:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:47:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:48:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:49:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:50:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:51:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:52:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:53:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:54:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:55:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:56:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:57:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:58:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
novspider.py
#-*-coding:utf8-*-
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
import re
class novSpider(RedisSpider):
name = "novspider"
redis_key = 'nvospider:start_urls'
start_urls = ['http://www.daomubiji.com/']
def parse(self,response):
selector = Selector(response)
table = selector.xpath('//table')
for each in table:
bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
content = each.xpath('tr/td/a/text()').extract()
url = each.xpath('tr/td/a/@href').extract()
for i in range(len(url)):
item = NovelspiderItem()
item['bookName'] = bookName
item['chapterURL'] = url[i]
try:
item['bookTitle'] = content[i].split(' ')[0]
item['chapterNum'] = content[i].split(' ')[1]
except Exception,e:
continue
try:
item['chapterName'] = content[i].split(' ')[2]
except Exception,e:
item['chapterName'] = content[i].split(' ')[1][-3:]
yield Request(url[i], callback='parseContent', meta={'item':item})
def parseContent(self, response):
selector = Selector(response)
item = response.meta['item']
html = selector.xpath('//div[@class="content"]').extract()[0]
textField = re.search('<div style="clear:both"></div>(.*?)<div', html,re.S).group(1)
text = re.findall('<p>(.*?)</p>',textField,re.S)
fulltext = ''
for each in text:
fulltext += each
item['text'] = fulltext
yield item
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for novelspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'novelspider'
SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'
ITEM_PIPELINES = ['novelspider.pipelines.NovelspiderPipeline']
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'novdata'
MONGODB_DOCNAME = 'nov1'
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from items import NovelspiderItem
from scrapy.conf import settings
import pymongo
class NovelspiderPipeline(object):
def __init__(self):
host = settings['MONGODB_HOST']
port = settings['MONGODB_PORT']
dbName = settings['MONGODB_DBNAME']
client = pymongo.MongoClient(host=host, port=port)
tdb = client[dbName]
self.post = tdb[settings['MONGODB_DOCNAME']]
def process_item(self, item, spider):
bookInfo = dict(item)
self.post.insert(bookInfo)
return item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Field, Item
class NovelspiderItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()
bookName = Field()
bookTitle = Field()
chapterNum = Field()
chapterName = Field()
chapterURL = Field()
text = Field()
You never reach the parse method that way. Use this instead:
yield Request(
url[i],
callback=self.parseContent, # <--
meta={'item':item})
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With