I'm trying to get Scrapy to send me an email when a crawler finishes or breaks. There's already a built-in extension for sending stats, but I'd like to attach the spider's errors as <spidername>-errors.log and the scraped items as <spidername>-items.json.
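For reference, this is roughly how the stock stats mailer is enabled (the module path may differ between Scrapy versions, and it only sends stats, with no attachments):

# settings.py -- the built-in stats extension
EXTENSIONS = {
    'scrapy.contrib.statsmailer.StatsMailer': 500,
}

STATSMAILER_RCPTS = ['you@example.com']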
I've connected callbacks to each of the signals, but for some reason only the last one is firing:
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


class StatusMailer(object):
    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=[(name, 'text/plain', contents) for name, contents in self.files.items()]
        )
Is there any way to access the exported items and the spider's errors from within Scrapy (possibly making some kind of hook to intercept those messages before they're printed to the console)?
Well, it looks like the problem was much simpler than I had thought. You have to "rewind" StringIO instances after you're completely done writing to them:
def spider_closed(self, spider):
    files = []

    for name, contents in self.files.items():
        contents.seek(0)  # rewind the buffer so MailSender reads it from the start
        files.append((name, 'text/plain', contents))

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=files
    )
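The file position of a StringIO buffer stays wherever the last write left it, so reading it back without seeking returns an empty string. A quick standalone demonstration (Python 2, matching the imports above):

from StringIO import StringIO

buf = StringIO()
buf.write('some data')

print repr(buf.read())   # '' -- the position is at the end of the buffer

buf.seek(0)              # rewind to the beginning
print repr(buf.read())   # 'some data'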
For anyone who's interested, here's my full email extension:
import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


def format_size(size):
    """Format a byte count as a human-readable string."""
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)
        size /= 1024.0
    return "%3.1f TB" % size


class GzipCompressor(gzip.GzipFile):
    """Gzip-compress everything written into an in-memory buffer.

    gzip.GzipFile keeps a running count of the uncompressed bytes written
    in self.size, so both compressors expose a .size attribute.
    """
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
        self.read = self.fileobj.read  # delegate reads to the underlying buffer


class PlainCompressor(StringIO):
    """A StringIO that rewinds itself before reading, so no seek(0) is needed."""
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        self.seek(0)
        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        return len(self.getvalue())


class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        files = []

        for name, compressed in self.files.items():
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        # self.files is a defaultdict, so indexing a missing key would silently
        # create an empty entry instead of raising KeyError -- test for the key
        items_name = spider.name + '-items.json'
        size = self.files[items_name].size if items_name in self.files else 0

        body = '''Crawl statistics:

 - Spider name: {0}
 - Spider finished at: {1}
 - Number of items scraped: {2}
 - Number of errors: {3}
 - Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
Add it to your settings.py:
EXTENSIONS = {
    'your_package.extensions.StatusMailer': 80
}
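(The number is the extension's load order; since StatusMailer doesn't depend on any other extension, the exact value shouldn't matter.)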
And configure it:
STATUSMAILER_RECIPIENTS = []
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None
MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''
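Note that Gmail on port 587 also requires TLS. Depending on your Scrapy version, MailSender should pick that up from these settings as well (MAIL_TLS may not exist in older releases):

MAIL_TLS = True
MAIL_FROM = 'your.username@gmail.com'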