I'm trying to build a fairly simple web crawler using Twisted. I have it working, but whenever I try to crawl more than a few hundred sites it hangs indefinitely for no apparent reason. Everything seems to work fine, except that at the very end it stalls with a handful of sites still left to process.
I used this tutorial as a blueprint: http://technicae.cogitat.io/2008/06/async-batching-with-twisted-walkthrough.html
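The core pattern I took from it is batching the page fetches through a DeferredSemaphore and collecting everything in a DeferredList, roughly like this (a stripped-down sketch with my own names, not my actual code):

from twisted.internet import defer, reactor
from twisted.web.client import getPage

def batch_fetch(urls, concurrency=30):
    """Fetch pages at most `concurrency` at a time, stop the reactor when all are done."""
    sem = defer.DeferredSemaphore(concurrency)
    work = [sem.run(getPage, url, timeout=10) for url in urls]
    dl = defer.DeferredList(work, consumeErrors=True)
    dl.addCallback(lambda results: reactor.stop())
    return dl

batch_fetch(['http://example.com'])
reactor.run()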
Here is my code:
class Spider:
    """Twisted-based html retrieval system."""

    def __init__(self, queue, url_list):
        self.process_queue = queue
        self.start_urls = []
        for url in url_list:
            self.start_urls.append(url)

    def crawl(self):
        """Extracts information from each website in start_urls."""
        deferreds = []
        sem = defer.DeferredSemaphore(30)
        for url in self.start_urls:
            d = sem.run(self._crawl, url, self.process_queue)
            deferreds.append(d)
        dl = defer.DeferredList(deferreds, consumeErrors=1)
        dl.addCallback(self.finish, self.process_queue)
        dl.addCallback(self.shutdown)
        reactor.run()

    def _crawl(self, url, queue):
        d = getPage(url, timeout=10)
        d.addCallback(self.parse, url, queue)
        d.addErrback(self.parse_error, url, queue)
        return d

    def parse(self, result, url, queue):
        print 'Parsing:', url
        data = {'body': result, 'url': url}
        response = Response(data['url'], data['body'])
        queue.put(response)
        return data

    def parse_error(self, result, url, queue):
        print 'Errback from:', url
        data = {'body': 'error', 'url': url}
        response = Response(data['url'], data['body'])
        queue.put(response)
        return data

    def finish(self, results, queue):
        for (valid, data) in results:
            if valid:
                print 'Success:', data['url']
            else:
                print 'Failed:', data['url']
        finish_signal = Response('FINISHED', 'DONE')
        queue.put(finish_signal)

    def shutdown(self, ignore):
        reactor.stop()
I run this code as part of a larger program, hence the queue.
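For context, the consuming side of that queue looks very roughly like this (heavily simplified; handle() is just a placeholder for what the real program does with each Response):

def handle(response):
    # stand-in for the real processing step
    print 'processing', response.url

def consume(queue):
    """Drain the queue until the FINISHED sentinel from Spider.finish() arrives."""
    while True:
        response = queue.get()          # blocks until the spider puts something
        if response.url == 'FINISHED':
            break
        handle(response)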
Any suggestions for making the DeferredList always fire? Or any ideas why it only fires about half the time, while the other half fails without raising any exception?
This is really frustrating, especially because it works perfectly with small numbers of URLs (1-100) but fails when scaled up. I'm new to Twisted, so I've probably just messed something up with the errbacks, but I can't figure out what, or how to fix it...
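To at least make the hang visible, the kind of diagnostic I'm considering looks like this (a sketch with my own names, not part of the real program): wrap each per-URL Deferred with addBoth() so it removes the URL from a pending set, and print whatever is left every few seconds with a LoopingCall.

from twisted.internet import defer, reactor, task
from twisted.web.client import getPage

def crawl_with_tracking(urls, concurrency=30):
    """Same semaphore/DeferredList setup, plus a periodic report of unfinished URLs."""
    pending = set(urls)
    sem = defer.DeferredSemaphore(concurrency)

    def mark_done(result, url):
        pending.discard(url)   # runs for both successes and failures
        return result

    def fetch(url):
        d = getPage(url, timeout=10)
        d.addBoth(mark_done, url)
        return d

    def report():
        print 'still pending:', pending

    task.LoopingCall(report).start(10, now=False)   # print survivors every 10 seconds
    work = [sem.run(fetch, url) for url in urls]
    dl = defer.DeferredList(work, consumeErrors=True)
    dl.addCallback(lambda _: reactor.stop())
    reactor.run()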
Also, before anyone answers with "Use Scrapy!": I can't use Scrapy, for reasons I won't go into here. Assume this program is my last hope and has to work.
EDIT:
Full standalone code, so people can run it directly:
import sys
from twisted.internet import defer, reactor
from twisted.web.client import getPage


class SeerSpider:
    """Twisted-based html retrieval system."""

    def __init__(self, queue, url_list):
        self.process_queue = queue
        self.start_urls = []
        for url in url_list:
            self.start_urls.append(url)

    def crawl(self):
        """Extracts information from each website in url_list."""
        deferreds = []
        sem = defer.DeferredSemaphore(30)
        for url in self.start_urls:
            d = sem.run(self._crawl, url, self.process_queue)
            deferreds.append(d)
        dl = defer.DeferredList(deferreds, consumeErrors=True)
        dl.addCallback(self.finish, self.process_queue)
        dl.addCallback(self.shutdown)
        reactor.run()

    def _crawl(self, url, queue):
        d = getPage(url, timeout=10)
        d.addCallback(self.parse, url, queue)
        d.addErrback(self.parse_error, url, queue)
        return d

    def parse(self, result, url, queue):
        data = {'body': result, 'url': url}
        response = Response(data['url'], data['body'])
        print response.url
        return data

    def parse_error(self, result, url, queue):
        data = {'body': 'error', 'url': url}
        response = Response(data['url'], data['body'])
        print response.url
        return data

    def finish(self, results, queue):
        finish_signal = Response('FINISHED', 'DONE')
        print finish_signal.url

    def shutdown(self, ignore):
        reactor.stop()


class Response:
    def __init__(self, url, text):
        self.url = url
        self.body = text


url_list = ['http://google.com/', 'http://example.com', 'http://facebook.com']  # this will work, make the list bigger to find the bug
spider = SeerSpider(None, url_list)
spider.crawl()
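To blow url_list up far enough to reproduce the hang, something like this works (urls.txt is just an example name, one URL per line):

# load a larger url_list from a plain text file, one URL per line
with open('urls.txt') as f:
    url_list = [line.strip() for line in f if line.strip()]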