我是 python 新手,在下面的代码中:我有一个在新发现的链接上递归的爬虫。在根链接上递归之后,似乎程序在打印了几个链接后就停止了,这应该会持续一段时间,但事实并非如此。我正在捕获并打印异常,但程序成功终止,所以我不确定它为什么停止。
from urllib import urlopen
from bs4 import BeautifulSoup
def crawl(url, seen):
try:
if any(url in s for s in seen):
return 0
html = urlopen(url).read()
soup = BeautifulSoup(html)
for tag in soup.findAll('a', href=True):
str = tag['href']
if 'http' in str:
print tag['href']
seen.append(str)
print "--------------"
crawl(str, seen)
except Exception, e:
print e
return 0
def main ():
print "$ = " , crawl("http://news.google.ca", [])
if __name__ == "__main__":
main()