I'm trying to have Scrapy send me an email when a crawler finishes or breaks. There's already a built-in extension for sending stats, but I'd like to attach the spider's errors as <spidername>-errors.log and the exported items as <spidername>-items.json.

I've connected a callback to each of the signals, but for some reason only the last one is firing:

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

class StatusMailer(object):
    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=[(name, 'text/plain', contents) for name, contents in self.files.items()]
        )

Is there any way to access the exported items and the spider's errors from within Scrapy (perhaps by hooking in somewhere to intercept these messages before they're printed to the console)?

1 Answer

Well, it turns out the problem was much simpler than I thought. You have to "rewind" the StringIO instances once you've finished writing to them:

def spider_closed(self, spider):
    files = []

    for name, contents in self.files.items():
        contents.seek(0)

        files.append((name, 'text/plain', contents))

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=files
    )
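
The underlying issue is easy to reproduce outside of Scrapy: writing leaves the stream position at the end of the buffer, so a subsequent read() returns an empty string until the position is reset. A minimal standalone sketch (Python 2, matching the code above):

from StringIO import StringIO

buf = StringIO()
buf.write('some output\n')

print repr(buf.read())  # '' - the position sits at the end after writing
buf.seek(0)             # rewind to the beginning
print repr(buf.read())  # 'some output\n'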

For anyone who's interested, here is my email extension:

import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

# The pure-Python module is required here: PlainCompressor subclasses
# StringIO below, and cStringIO.StringIO is a factory function, not a class.
from StringIO import StringIO

def format_size(size):
    # Render a byte count as a human-readable string, e.g. "3.1 MB".
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)

        size /= 1024.0

class GzipCompressor(gzip.GzipFile):
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        # Compress into an in-memory PlainCompressor buffer, and expose that
        # buffer's read() so the compressed bytes can be read back out.
        super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
        self.read = self.fileobj.read

class PlainCompressor(StringIO):
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        # Rewind before reading so the whole buffer is returned.
        self.seek(0)

        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        # Uncompressed size in bytes; GzipFile tracks a matching attribute.
        return len(self.getvalue())

class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item) + '\n')
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        files = []

        for name, compressed in self.files.items():
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body = '''Crawl statistics:

 - Spider name: {0}
 - Spider finished at: {1}
 - Number of items scraped: {2}
 - Number of errors: {3}
 - Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
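
For context, MailSender pulls each attachment's contents by calling read() on the file object in the (name, mimetype, file) tuple, which is why both compressor classes make sure read() returns the full buffer. A simplified illustration of that attachment step (not Scrapy's actual code):

from email.mime.base import MIMEBase

def build_attachment(attach_name, mimetype, f):
    # Turn one attachs tuple into a MIME part; the key point is that
    # f.read() is what ultimately gets called on the compressor object.
    maintype, subtype = mimetype.split('/', 1)
    part = MIMEBase(maintype, subtype)
    part.set_payload(f.read())
    part.add_header('Content-Disposition', 'attachment', filename=attach_name)
    return part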

Add it to your settings.py:

EXTENSIONS = {
    'your_package.extensions.StatusMailer': 80
}

And configure it:

STATUSMAILER_RECIPIENTS = []
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None

MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''
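
Note that Gmail on port 587 expects the connection to be upgraded with STARTTLS. Depending on your Scrapy version, MailSender can be told to do this via a setting; if your version supports it, you'll probably also want:

MAIL_TLS = True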