from urllib.request import urlopen   # urlopen is used by analyze() below


class Crawler1(object):
    def __init__(self):
        'constructor'
        self.visited = []
        self.will_visit = []

    def reset(self):
        'reset the visited links'
        self.visited = []
        self.will_visit = []

    def crawl(self, url, n):
        'crawl to depth n starting at url'
        self.analyze(url)
        if n < 0:
            self.reset()
        elif url in self.visted:
            self.crawl(self.will_visit[-1], n - 1)
        else:
            self.visited.append(url)
            self.analyze(url)
            self.visited.append(url)
            self.will_visit.pop(-1)
            self.crawl(self.will_visit[-1], n - 1)

    def analyze(self, url):
        'returns the list of URLs found in the page url'
        print("Visiting", url)
        content = urlopen(url).read().decode()
        collector = Collector(url)    # Collector is defined in a separate file (sketch below)
        collector.feed(content)
        urls = collector.getLinks()
        for i in urls:
            if i in self.will_visit:
                pass
            else:
                self.will_visit.append(i)
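
Collector lives in a separate file that is not shown; assume it is a small HTMLParser subclass roughly like the sketch below (getLinks matches the call in analyze; the urljoin-based resolution and the http filter are only a guess at what it does):

from html.parser import HTMLParser
from urllib.parse import urljoin

class Collector(HTMLParser):
    'assumed link collector: gathers the absolute URLs linked from one page'
    def __init__(self, url):
        super().__init__()
        self.url = url      # the page's own URL, used to resolve relative links
        self.links = []     # absolute URLs found so far

    def handle_starttag(self, tag, attrs):
        # keep the href of every anchor tag, resolved against the page URL
        if tag == 'a':
            for attr, value in attrs:
                if attr == 'href':
                    absolute = urljoin(self.url, value)
                    if absolute.startswith('http'):
                        self.links.append(absolute)

    def getLinks(self):
        'return the list of collected links'
        return self.links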
I want the program to follow a chain of links, but only as far as "n" allows.
I am not sure what is wrong with the code, although I am fairly sure it is nearly there; some hints would be nice. (A sketch of the behaviour I am after follows the expected output below.)
Expected output if n = 1 and Site1 contains links to Site2 and Site3:
Visiting [Site1]
Visiting [Site2]
Visiting [Site3]
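
For concreteness, the behaviour I am after is roughly the sketch below. crawl_sketch and analyze_links are hypothetical names, not part of my code; analyze_links just reuses the Collector sketch above instead of my will_visit bookkeeping:

from urllib.request import urlopen

def analyze_links(url):
    'return the list of URLs linked from the page at url (hypothetical helper)'
    collector = Collector(url)
    collector.feed(urlopen(url).read().decode())
    return collector.getLinks()

def crawl_sketch(url, n, visited=None):
    'visit url, then recursively visit its links, going at most n levels deeper'
    if visited is None:
        visited = set()             # shared across the recursive calls
    if url in visited:
        return                      # never visit the same page twice
    visited.add(url)
    print("Visiting", url)
    if n <= 0:
        return                      # depth budget used up: do not follow links
    for link in analyze_links(url):
        crawl_sketch(link, n - 1, visited)

With n = 1 and links to Site2 and Site3 on Site1, this prints exactly the three "Visiting" lines above; what I cannot see is where my visited/will_visit handling deviates from that.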