我正在尝试遍历 Reddit 上的多篇文章,浏览每篇文章并提取最相关的实体(通过过滤最高相关性分数来完成),然后将其添加到master_locations
列表中:
from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup
alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'
urls = []
locations = []
relevance = []
master_locations = []
def get_all_links(page):
html = urllib2.urlopen(page).read()
soup = BeautifulSoup(html)
for a in soup.find_all('a', 'title may-blank ', href=True):
urls.append(a['href'])
run_alchemy_entity_per_link(a['href'])
def run_alchemy_entity_per_link(articleurl):
response = alchemyapi.entities('url', articleurl)
if response['status'] == 'OK':
for entity in response['entities']:
if entity['type'] in entity == 'Country' or entity['type'] == 'Region' or entity['type'] == 'City' or entity['type'] == 'StateOrCountry' or entity['type'] == 'Continent':
if entity.get('disambiguated'):
locations.append(entity['disambiguated']['name'])
relevance.append(entity['relevance'])
else:
locations.append(entity['text'])
relevance.append(entity['relevance'])
else:
locations.append('No Location')
relevance.append('0')
max_pos = relevance.index(max(relevance)) # get nth position of the highest relevancy score
master_locations.append(locations[max_pos]) #Use n to get nth position of location and store that location name to master_locations
del locations[0] # RESET LIST
del relevance[0] # RESET LIST
else:
print('Error in entity extraction call: ', response['statusInfo'])
get_all_links('http://www.reddit.com/r/worldnews') # Gets all URLs per article, then analyzes entity
for item in master_locations:
print(item)
但我认为出于某种原因,列表locations
并relevance
没有被重置。我做错了吗?
打印出来的结果是:
Holland
Holland
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Johor Bahru
(可能来自未被清除的列表)