I'm trying to run scrapy (spider/crawl)from django project (task in the admin interrface using celery). this is my code . this is the error when I try to call the task from a python shell
djangoproject:
-monapp:
-tasks.py
-spider.py
-myspider.py '
-models.py
.....
tasks.py:
from djcelery import celery
from demoapp.spider import *
from demoapp.myspider import *
@celery.task
def add(x, y):
return x + y
@celery.task
def scra():
result_queue = Queue()
crawler = CrawlerWorker(MySpider(), result_queue)
crawler.start()
return "success"
spider.py:
from scrapy import project, signals
from scrapy.settings import Settings
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing
class CrawlerWorker(multiprocessing.Process):
def __init__(self, spider, result_queue):
multiprocessing.Process.__init__(self)
self.result_queue = result_queue
self.crawler = Crawler(Settings())
if not hasattr(project, 'crawler'):
self.crawler.install()
self.crawler.configure()
self.items = []
self.spider = spider
dispatcher.connect(self._item_passed, signals.item_passed)
def _item_passed(self, item):
self.items.append(item)
def run(self):
self.crawler.crawl(self.spider)
self.crawler.start()
self.crawler.stop()
self.result_queue.put(self.items)
myspider.py
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field
class TorentItem(Item):
title = Field()
desc = Field()
class MySpider(CrawlSpider):
name = 'job'
allowed_domains = ['tanitjobs.com']
start_urls = [\
'http://tanitjobs.com/browse-by-category/Nurse/',]
rules = (
Rule (SgmlLinkExtractor(allow=('page=*',)
,restrict_xpaths=('//div[@class="pageNavigation"]',),
unique = True)
, callback='parse_item', follow= True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items= hxs.select('\
//div[@class="offre"]/div[@class="detail"]')
scraped_items =[]
for item in items:
scraped_item = TorentItem()
scraped_item['title']=item.select(\
'a/strong/text()').extract()
scraped_item['desc'] =item.select(\
'./div[@class="descriptionjob"]/text()').extract()
scraped_items.append(scraped_item)
return scraped_items
I got it work mine on the shell using django management command. Below is my code snippet. Feel free to modify to fit your needs.
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from django.core.management.base import BaseCommand
from myspiderproject.spiders.myspider import MySpider
class ReactorControl:
def __init__(self):
self.crawlers_running = 0
def add_crawler(self):
self.crawlers_running += 1
def remove_crawler(self):
self.crawlers_running -= 1
if self.crawlers_running == 0:
reactor.stop()
def setup_crawler(domain):
settings = get_project_settings()
crawler = Crawler(settings)
crawler.configure()
crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
spider = MySpider(domain=domain)
crawler.crawl(spider)
reactor_control.add_crawler()
crawler.start()
reactor_control = ReactorControl()
class Command(BaseCommand):
help = 'Crawls the site'
def handle(self, *args, **options):
setup_crawler('somedomain.com')
reactor.run() # the script will block here until the spider_closed signal was sent
hope this helps.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With