我也遇到了同样的问题,所以我用 type() 动态创建了蜘蛛类:
from scrapy.contrib.spiders import CrawlSpider
import urlparse
class GenericSpider(CrawlSpider):
    """Generic spider that doubles as a factory for per-site spider classes.

    ``create`` uses ``type()`` to build a new subclass for the site a link
    points at, so every site gets its own ``name``, ``allowed_domains`` and
    ``start_urls`` instead of sharing the empty defaults declared here.
    """

    name = 'generic'
    allowed_domains = []
    start_urls = []

    @classmethod
    def create(cls, link):
        """Build a spider subclass dedicated to the site ``link`` points at.

        Args:
            link: Absolute URL including the scheme, e.g.
                ``'http://www.google.com'``.

        Returns:
            A new subclass of ``cls`` with ``start_urls`` set to ``[link]``,
            ``allowed_domains`` set to the URL's host name, and ``name`` set
            to the URL's lower-cased network location. The class is named
            after the host, e.g. ``GoogleComGenericSpider`` for
            ``www.google.com``.

        Raises:
            ValueError: If no host can be extracted from ``link`` (for
                example when the scheme is missing).
        """
        parsed = urlparse.urlparse(link)

        # ``hostname`` is lower-cased and drops the ``user:pass@`` prefix and
        # ``:port`` suffix that ``netloc`` keeps. ``allowed_domains`` is meant
        # to hold bare domain names, so the raw ``netloc`` is not suitable.
        host = parsed.hostname
        if not host:
            # Without a host the generated class would get an empty name and
            # an empty allowed domain; fail early with a clear message.
            raise ValueError(
                'cannot derive a domain from link %r; '
                'expected an absolute URL such as http://www.example.com'
                % (link,)
            )

        # Derive the class name so that www.google.com results in
        # GoogleComGenericSpider: strip a leading "www.", title-case the
        # labels and remove the dots.
        bare_host = host[4:] if host.startswith('www.') else host
        class_name = bare_host.title().replace('.', '') + cls.__name__

        return type(class_name, (cls,), {
            'allowed_domains': [host],
            'start_urls': [link],
            # The spider name keeps the full network location (port
            # included), so names produced for existing links do not change.
            'name': parsed.netloc.lower(),
        })
因此,要为“http://www.google.com”创建一个蜘蛛,我会这样做:
In [3]: google_spider = GenericSpider.create('http://www.google.com')
In [4]: google_spider
Out[4]: __main__.GoogleComGenericSpider
In [5]: google_spider.name
Out[5]: 'www.google.com'
希望这对你有所帮助。