我试图在单独的脚本中执行 Scrapy spider。当我在循环中执行这个脚本时(例如使用不同的参数运行同一个 spider),会得到 ReactorAlreadyRunning 异常。我的代码片段:
from celery import task
from episode.skywalker.crawlers import settings
from multiprocessing.queues import Queue
from scrapy import log, project, signals
from scrapy.settings import CrawlerSettings
from scrapy.spider import BaseSpider
from scrapy.spidermanager import SpiderManager
from scrapy.xlib.pydispatch import dispatcher
import multiprocessing
from twisted.internet.error import ReactorAlreadyRunning
class CrawlerWorker(multiprocessing.Process):
    """Process that crawls one spider and publishes the items it collected.

    Items delivered through the ``signals.item_passed`` signal are
    accumulated in ``self.items``; when ``run`` finishes, that list is put
    on ``result_queue`` as a single object.
    """

    def __init__(self, spider, result_queue):
        """Set up the crawler and subscribe to passed items.

        @param spider: spider object handed to ``crawler.crawl`` in ``run``
        @param result_queue: queue on which ``run`` puts the list of items
        """
        # Kept as a function-level import, exactly as in the original code.
        from scrapy.crawler import CrawlerProcess

        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(CrawlerSettings(settings))
        # Install this crawler only when ``project`` has no ``crawler``
        # attribute yet; ``configure`` is called unconditionally.
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        """Signal handler: remember every item that was passed."""
        self.items.append(item)

    def run(self):
        """Crawl the spider, then put the collected items on the queue.

        The items are published from a ``finally`` clause: the consumer does
        a blocking ``get()`` on the queue, so if the crawl failed without
        publishing anything the consumer would wait forever. Any exception
        other than ``ReactorAlreadyRunning`` still propagates after the
        (possibly partial) item list has been put on the queue.
        """
        try:
            self.crawler.crawl(self.spider)
            try:
                self.crawler.start()
            except ReactorAlreadyRunning:
                # Deliberate best effort: carry on to stop() and publishing
                # whatever was collected.
                pass
            self.crawler.stop()
        finally:
            self.result_queue.put(self.items)
@task
def execute_spider(spider, **spider__kwargs):
    '''
    Execute spider within separate process
    @param spider: spider class to crawl or the name (check if instance)
    @param spider__kwargs: keyword arguments passed to the spider manager
        when the spider has to be created
    @return: list of the items collected by the worker process
    '''
    # Anything that is not already a BaseSpider instance is handed to the
    # spider manager, which creates the spider from it.
    if not isinstance(spider, BaseSpider):
        manager = SpiderManager(settings.SPIDER_MODULES)
        spider = manager.create(spider, **spider__kwargs)

    # multiprocessing.Queue() is the documented factory for a process-safe
    # queue and does not need an explicit context argument.
    result_queue = multiprocessing.Queue()
    crawler = CrawlerWorker(spider, result_queue)
    crawler.start()

    # Drain the queue BEFORE joining: a child that has put data on a queue
    # may not terminate until that data has been consumed.
    items = list(result_queue.get())
    # Reap the worker so that no zombie process is left behind.
    crawler.join()
    return items
我猜测这是由于多次运行 Twisted reactor 引起的。我该如何避免这个问题?一般来说,有没有办法在不使用 reactor 的情况下运行 spider?