- 加载一个大的 URL 列表
- 使用 requests 的 async 模块并发地发出 HTTP 请求,获取每个 URL 的内容
- 使用 lxml 解析页面内容,以检查页面中是否存在链接
- 如果页面上存在链接,则将有关页面的一些信息保存在 ZODB 数据库中
当我用 4 或 5 个 URL 测试脚本时,它运行良好,脚本结束时只会出现以下消息:
Exception KeyError: KeyError(45989520,) in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
但是当我尝试检查大约 24000 个 URL 时,它在列表末尾失败(当还有大约 400 个 URL 需要检查时)并出现以下错误:
Traceback (most recent call last):
File "check.py", line 95, in <module>
File "/home/alex/code/.virtualenvs/linka/local/lib/python2.7/site-packages/requests/async.py", line 83, in map
File "/home/alex/code/.virtualenvs/linka/local/lib/python2.7/site-packages/gevent-1.0b2-py2.7-linux-x86_64.egg/gevent/greenlet.py", line 405, in joinall
ImportError: No module named queue
Exception KeyError: KeyError(45989520,) in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
我尝试了 PyPI 上可用的 gevent 版本,也尝试了从 gevent 代码仓库下载并安装最新版本(1.0b2)。
我不明白为什么会发生这种情况,以及为什么只有当我检查一堆 URL 时才会发生这种情况。有什么建议么?
from requests import async, defaults
from lxml import html
from urlparse import urlsplit
from gevent import monkey
from BeautifulSoup import UnicodeDammit
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
import transaction
import persistent
import random
# Open the ZODB database backed by the file 'Data.fs' and fetch its root
# object; save_data() stores its results as entries of `root`.
storage = FileStorage('Data.fs')
db = DB(storage)
connection = db.open()
root = connection.root()
# Override the requests library defaults used by every request made below:
# send a desktop Firefox User-Agent header and set 'max_retries' to 10.
defaults.defaults['base_headers']['User-Agent'] = "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0"
defaults.defaults['max_retries'] = 10
def save_data(source, target, anchor):
    """Record a found link in the ZODB root under the key ``source``.

    The stored value is a ``PersistentMapping`` with the keys ``target``
    and ``anchor``. Any existing entry for ``source`` is replaced.
    """
    record = {'target': target, 'anchor': anchor}
    root[source] = persistent.mapping.PersistentMapping(record)
def decode_html(html_string):
    """Return ``html_string`` converted to unicode by ``UnicodeDammit``.

    Args:
        html_string: Raw page markup, as passed to ``UnicodeDammit``.

    Returns:
        The unicode text produced by ``UnicodeDammit``.

    Raises:
        UnicodeDecodeError: If ``UnicodeDammit`` produced no (or empty)
            unicode text. The reason lists the encodings that were tried.
    """
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        # UnicodeDecodeError takes exactly five arguments
        # (encoding, object, start, end, reason). Building it with only a
        # message raises TypeError instead of the intended exception.
        # The `object` argument must be a byte string, so fall back to an
        # empty one if the caller handed in something else.
        raw = html_string if isinstance(html_string, bytes) else b''
        reason = "Failed to detect encoding, tried [%s]" % (
            ', '.join(converted.triedEncodings))
        raise UnicodeDecodeError('unknown', raw, 0, len(raw), reason)
    return converted.unicode
def find_link(html_doc, url):
    """Search a page for the first absolute link pointing at example.org.

    The markup is decoded with ``decode_html``, re-encoded as UTF-8 and
    parsed with lxml. Only ``href`` links whose value starts with ``http``
    are considered.

    Returns:
        ``(url, link, anchor_text)`` for the first link whose network
        location contains ``example.org``, or ``False`` if there is none.
    """
    markup = decode_html(html_doc).encode('utf-8')
    tree = html.document_fromstring(markup)
    for node, attr_name, href, _pos in tree.iterlinks():
        if attr_name != "href" or not href.startswith('http'):
            continue
        if "example.org" in urlsplit(href).netloc:
            return (url, href, node.text_content().strip())
    return False
def check(response):
if response.status_code == 200:
html_doc = response.content
result = find_link(html_doc, response.url)
if result:
source, target, anchor = result
# print "Source: %s" % source
# print "Target: %s" % target
# print "Anchor: %s" % anchor
# print
save_data(source, target, anchor)
global todo
todo = todo -1
print todo
def load_urls(fname):
    """Read URLs from ``fname``, one per line, and return the unique ones.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that no empty URL is ever requested. Duplicates are dropped
    and the first occurrence keeps its position, so the result order is
    deterministic.

    Args:
        fname: Path of the text file to read.

    Returns:
        A list of unique, non-empty URL strings in file order.
    """
    seen = set()
    urls = []
    with open(fname) as fh:
        # Iterate lazily instead of readlines() to avoid holding a second
        # copy of a large file in memory.
        for line in fh:
            url = line.strip()
            if url and url not in seen:
                seen.add(url)
                urls.append(url)
    return urls
if __name__ == "__main__":
urls = load_urls('urls.txt')
rs = []
todo = len(urls)
print "Ready to analyze %s pages" % len(urls)
for url in urls:
rs.append(async.get(url, hooks=dict(response=check), timeout=10.0))
responses = async.map(rs, size=100)
print "DONE."