I'm trying to get Scrapy to send me an email when a crawler finishes or breaks. There's already a built-in extension for sending stats, but I'd like to attach the spider's errors as <spidername>-errors.log and the exported items as <spidername>-items.json.
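For context, this is how I register the extension in settings.py (the module path is just where the extension lives in my project; STATUSMAILER_RCPTS is my own setting, while MAIL_FROM and MAIL_HOST are the standard ones read by MailSender.from_settings):

# settings.py
EXTENSIONS = {
    'myproject.extensions.StatusMailer': 500,  # hypothetical path to the extension below
}
STATUSMAILER_RCPTS = ['me@example.com']  # custom setting read in from_crawler
MAIL_FROM = 'scrapy@example.com'
MAIL_HOST = 'localhost'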
I've hooked a callback up to each signal, but for some reason only the last one fires:
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.serialize import ScrapyJSONEncoder

try:
    from cStringIO import StringIO  # C implementation, if available
except ImportError:
    from StringIO import StringIO


class StatusMailer(object):

    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        # One in-memory buffer per attachment name, created on first write.
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")
        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=[(name, 'text/plain', contents)
                     for name, contents in self.files.items()]
        )
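In case it's relevant: as far as I can tell, MailSender reads each file-like attachment with f.read(), and my StringIO buffers have just been written to, so their position sits at the end. I assume they need rewinding before sending, something along these lines:

def spider_closed(self, spider):
    files = []
    for name, contents in self.files.items():
        # Rewind so read() returns the full contents, not an empty string
        # (assumption: send() reads from the current file position).
        contents.seek(0)
        files.append((name, 'text/plain', contents))

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=files
    )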
Is there any way to access the exported items and the spider's errors from within Scrapy (perhaps by making some sort of hook to intercept those messages before they're printed to the console)?
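For the interception idea, the closest I've come is attaching a plain logging handler that copies error records into one of the buffers. This is only a sketch, and it assumes a Scrapy version that routes its output through Python's standard logging module (I understand older versions used Twisted's log system instead):

import logging

class BufferingErrorHandler(logging.Handler):
    """Copy every record at ERROR or above into an in-memory buffer."""

    def __init__(self, buf):
        logging.Handler.__init__(self, level=logging.ERROR)
        self.buf = buf

    def emit(self, record):
        self.buf.write(self.format(record) + '\n')

# Hypothetical wiring inside the extension, e.g. from a spider_opened handler:
# handler = BufferingErrorHandler(self.files[spider.name + '-errors.log'])
# logging.getLogger().addHandler(handler)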