Not sure if you still need this, but I've put together an example. If you have a specific website in mind, we can all take a look at it.
from scrapy.http import Request
from scrapy.spider import BaseSpider

class TestSpider(BaseSpider):
    name = "TEST"
    allowed_domains = ["example.com", "example.iana.org"]

    def __init__(self, **kwargs):
        super(TestSpider, self).__init__(**kwargs)
        self.url = "http://www.example.com"
        self.max_loop = 3
        self.loop = 0  # We want it to loop 3 times, so keep a counter on the spider

    def start_requests(self):
        # I'll write it out more explicitly here
        print "OPEN"
        checkRequest = Request(
            url = self.url,
            meta = {"test": "first"},
            callback = self.checker
        )
        return [checkRequest]

    def checker(self, response):
        # I wasn't sure about a specific website that gives a 302,
        # so I just used 200. We need the loop counter or it will keep going.
        if self.loop < self.max_loop and response.status == 200:
            print "RELOOPING", response.status, self.loop, response.meta["test"]
            self.loop += 1
            checkRequest = Request(
                url = self.url,
                meta = {"test": "not first"},
                callback = self.checker
            )
            return [checkRequest]
        else:
            print "END LOOPING"
            self.results(response)  # No need to return, just call the method

    def results(self, response):
        print "DONE"  # Do stuff here
In settings.py, set this option:
DUPEFILTER_CLASS = 'scrapy.dupefilter.BaseDupeFilter'
This is what actually turns off the filtering of duplicate requests. It's a bit confusing, since BaseDupeFilter is not actually the default: it doesn't really filter anything, so using it here disables the duplicate filter. This means we will submit three different requests that will loop through the checker method. Also, I am using scrapy 0.16:
>scrapy crawl TEST
>OPEN
>RELOOPING 200 0 first
>RELOOPING 200 1 not first
>RELOOPING 200 2 not first
>END LOOPING
>DONE
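As an aside, if you'd rather not swap the dupefilter globally in settings.py, Request also accepts a dont_filter flag, so only the looping request opts out of duplicate filtering. A sketch of what checker would build instead:

# Sketch: per-request opt-out, leaving the default dupefilter enabled
checkRequest = Request(
    url = self.url,
    meta = {"test": "not first"},
    dont_filter = True,  # bypass the duplicate filter for this request only
    callback = self.checker
)

Either approach works; the dont_filter route just keeps duplicate filtering intact for any other requests the spider makes.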