我正在尝试用 Python 编写一个程序,从 JSON 格式的文件中提取推文的文本(不包括转推)。以下是 Python 代码片段(数据文件很大,约 20MB,因此没有附在这里)。
import sys
import difflib
import twitter
import json
from pprint import pprint
# Input argument is the filename of the JSON ASCII file from the Twitter API.
# Every line is parsed as one JSON document, and element [1] of that document
# is treated as the tweet object.
# NOTE(review): if the whole file is a single JSON array of tweets on one
# line, json.loads(line) returns that array and [1] selects only its second
# element, which would yield exactly one tweet -- confirm the file layout.
filename = sys.argv[1]

tweets_text = []      # Lower-cased, UTF-8 encoded text of every kept tweet
tweets_location = []  # Location of every tweet (free text field - not always accurate or given)
tweets_timezone = []  # Timezone name of every tweet

# Stream the file line by line instead of loading it whole; the context
# manager closes the file even if processing fails.
with open(filename, "r") as f:
    for line in f:
        # Lines that are not valid JSON (including blank lines) are skipped.
        # Only the parse is guarded so unrelated errors are not hidden.
        try:
            tweet = json.loads(line)
        except ValueError:
            continue

        record = tweet[1]

        # Ignore retweets, and records that carry no text at all.
        if 'retweeted_status' in record or 'text' not in record:
            continue

        raw_text = record['text']

        # Ignore 'manual' retweets, i.e. messages starting with RT. This must
        # be tested on the original text: after lower-casing, an upper-case
        # "RT " can never match.
        if raw_text.startswith("RT "):
            continue

        # The user object, its location and its time zone may be missing or
        # null; fall back to an empty string so .encode() never sees None.
        user = record.get('user') or {}
        text = raw_text.encode('utf-8', 'ignore').lower()
        location = (user.get('location') or u'').encode('utf-8', 'ignore')
        timezone = (user.get('time_zone') or u'').encode('utf-8', 'ignore')

        # Append only once every field is computed, so the three lists stay
        # aligned index-for-index.
        tweets_text.append(text)
        tweets_location.append(location)
        tweets_timezone.append(timezone)

# Show result
print(tweets_text)
问题是我只得到了一条推文。有人能指出错误在哪里吗?