我从 GAE 中的处理程序调用网络爬取函数,它检索一些图像,然后显示它们。它在第一次调用时工作得很好,但下一次它显示所有相同的图像并且爬虫从最后一个停止的地方启动。我认为这是我的全局变量未正确重置的问题。
每次我重新部署应用程序时,它都会第一次正确执行,但问题就开始了。
这是我的代码,如果您需要我澄清它,请告诉我,但我认为它应该有意义。
这是刮板功能
visited_pages = []
visit_queue = deque([])
collected_pages = []
collected_pics = []
count = 0
pic_count = 0
def scrape_pages(url, root_url, keywords=[], recurse=True):
#variables
max_count = 16
pic_num = 100
global count
global pic_count
global collected_pics
global collected_pages
print 'the keywords and url are'
print keywords
print url
#this is all of the links that have been scraped
the_links = []
soup = soupify_url(url)
#only add new pages onto the queue if the recursion argument is true
if recurse:
#find all the links on the page
try:
for tag in soup.findAll('a'):
the_links.append(tag.get('href'))
except AttributeError:
return
try:
external_links, internal_links, root_links, primary_links = categorize_links(the_links, url, root_url)
except TypeError:
return
#change it so this depends on the input
links_to_visit = external_links + internal_links + root_links
#build the queue
for link in links_to_visit:
if link not in visited_pages and link not in visit_queue:
visit_queue.append(link)
visited_pages.append(url)
count = count + 1
# print 'number of pages visited'
# print count
#add pages to collected_pages depending on the criteria given if any keywords are given
if keywords:
page_to_add = find_pages(url, soup, keywords)
# print 'page to add'
# print page_to_add
if page_to_add and page_to_add not in collected_pages:
collected_pages.append(page_to_add)
pics_to_add = add_pics(url, soup)
# print 'pics to add'
# print pics_to_add
if pics_to_add:
collected_pics.extend(pics_to_add)
#here is where the actual recursion happens by finishing the queue
while visit_queue:
if count >= max_count:
return
if pic_count > pic_num:
return
link = visit_queue.popleft()
# print link
scrape_pages(link, root_url, keywords)
# print '***done***'
###done with the recursive scraping function here
#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
visit_queue, the_url = scrape_bing.get_links(keywords, a_list = False)
scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)
global collected_pics
global pic_count
global count
global visited_pages
global visit_queue
pic_count = 0
count = 0
visited_pages = []
visit_queue = deque([])
pics_to_return = collected_pics
collected_pics = []
return pics_to_return
这是调用刮板功能的处理程序
#this just simply displays the images
class Try(BlogHandler):
def get(self, keyword):
keyword = str(keyword)
keyword_list = keyword.split()
img_list = scraper.scrape_bing_src(keyword_list)
for img in img_list:
self.response.write("""<br><img src='""" + img + """'>""")
self.response.write('we are done here')