I created an extension in Scrapy to set a common path variable (and a few other things), so that if the output path ever changes, only one file needs to be modified. However, I cannot access that path from inside the spider.
Below is the extension code:
import datetime,re,os,random
from scrapy import signals
from scrapy.spider import Spider
from scrapy.conf import settings

class Common(object):
    output_dir = ''

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        if settings['DATE']:
            cls.output_dir = 'output/' + settings['DATE'] + '/'
        else:
            cls.output_dir = 'output/' + datetime.date.today().strftime('%Y-%m-%d') + '/'
The extension is enabled in the settings like this:
EXTENSIONS = {'scrapyproject.common.Common':500,}
My spider code is as follows:
from scrapyproject.spiderCommon import *

class dmozSpider(CrawlSpider):
    name = 'dmozSpider'
    allowed_domains = ['www.dmoz.org']
    start_urls = ['http://www.dmoz.org']

    rules = (
        Rule(SgmlLinkExtractor(allow=(),), callback='parse_item', follow=True),
    )

    def __init__(self, *a, **kw):
        super(dmozSpider, self).__init__(self, *a, **kw)
        dispatcher.connect(self.my_spider_opened, signals.spider_opened)

    def parse_item(self, response):
        sel = Selector(response)
        vifUrls = sel.xpath('//ul[@class="directory dir-col"]/li/a/@href').extract()
        with open(Common.output_dir + self.name + '.csv', 'a') as f:
            for vifUrl in vifUrls:
                print vifUrl
                f.write("%s\n" % vifUrl)
        pass

    def my_spider_opened(self, spider):
        fo = open(Common.output_dir + self.name + '.csv', "w+")
        fo.truncate()
        fo.close()
The spiderCommon file contains the following:
from scrapyproject.common import *
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
The value of Common.output_dir is not accessible inside the spider, yet I can access it fine inside a pipeline:
from scrapyproject.common import *

class XmlExportPipeline(object):

    def __init__(self, **kwargs):
        self.file_count = 1

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print Common.output_dir

    def spider_closed(self, spider):
        self.file_count = self.file_count + 1

    def process_item(self, item, spider):
        return item
When I run the spider, it pauses at [scrapy] DEBUG: Web service listening on 0.0.0.0:6080 and then finishes without crawling a single link. As far as I can tell, this is because it never gets the value of Common.output_dir. Can anyone point out where I am going wrong?
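For reference, the behaviour I am ultimately after could also be expressed directly in the spider, without going through the shared class attribute. Below is a rough sketch of that idea, assuming a Scrapy version where spiders can override the from_crawler classmethod; this is not code I currently have, only an illustration of the intent (rules, parse_item and my_spider_opened would stay the same as above, reading self.output_dir instead of Common.output_dir):

import datetime

from scrapyproject.spiderCommon import *

class dmozSpider(CrawlSpider):
    name = 'dmozSpider'
    allowed_domains = ['www.dmoz.org']
    start_urls = ['http://www.dmoz.org']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Let Scrapy build the spider as usual, then derive the output
        # directory from the same DATE setting the extension reads.
        spider = super(dmozSpider, cls).from_crawler(crawler, *args, **kwargs)
        date = crawler.settings.get('DATE')
        if not date:
            date = datetime.date.today().strftime('%Y-%m-%d')
        spider.output_dir = 'output/' + date + '/'
        return spider

Even so, I would still like to understand why the extension-based approach fails in the spider while the pipeline sees the value without any problem.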