出于学术和性能的考虑,鉴于这种爬网递归网络爬网功能(仅在给定域内爬网),使其迭代运行的最佳方法是什么?目前,当它运行时,当它完成时,python 已经使用超过 1GB 的内存,这对于在共享环境中运行是不可接受的。
def crawl(self, url):
"Get all URLS from which to scrape categories."
try:
links = BeautifulSoup(urllib2.urlopen(url)).findAll(Crawler._match_tag)
except urllib2.HTTPError:
return
for link in links:
for attr in link.attrs:
if Crawler._match_attr(attr):
if Crawler._is_category(attr):
pass
elif attr[1] not in self._crawled:
self._crawled.append(attr[1])
self.crawl(attr[1])