我有一个文件,其中包含从 python-tweetstreamer 提取的数百行 json 编码的推文。这些行看起来像:
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "kasian pak weking :| RT @veNikenD: Kasian kenapa???RT @SaputraJordhy: kasian \u256e(\u256f_\u2570)\u256d RT @veNikenD: Tak ingin lg kudengar kata2 yg tak ......", "created_at": "Tue Apr 03 14:07:59 +0000 2012", "retweeted": false, "in_reply_to_status_id": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [24, 33], "screen_name": "veNikenD", "id": 64910664, "name": "Ve Damayanti", "id_str": "64910664"}, {"indices": [54, 68], "screen_name": "SaputraJordhy", "id": 414675856, "name": "jordhy_ynwa", "id_str": "414675856"}, {"indices": [88, 97], "screen_name": "veNikenD", "id": 64910664, "name": "Ve Damayanti", "id_str": "64910664"}], "hashtags": [], "urls": []}, "in_reply_to_status_id_str": null, "id_str": "187179645026836481", "in_reply_to_screen_name": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/418679759/Young_Minato_and_Kushina_by_HaNa7.jpg", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1953774147/Untitled1_normal.png", "profile_sidebar_fill_color": "DDEEF6", "is_translator": false, "id": 414675856, "profile_text_color": "1c181c", "followers_count": 46, "protected": false, "location": "", "default_profile_image": false, "listed_count": 0, "utc_offset": 25200, "statuses_count": 409, "description": "never walk alone", "friends_count": 76, "profile_link_color": "0084B4", "profile_image_url": "http://a0.twimg.com/profile_images/1953774147/Untitled1_normal.png", "notifications": null, "show_all_inline_media": false, "geo_enabled": false, "profile_background_color": "C0DEED", "id_str": "414675856", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/418679759/Young_Minato_and_Kushina_by_HaNa7.jpg", "screen_name": "SaputraJordhy", "lang": "id", "profile_background_tile": true, "favourites_count": 0, "name": "jordhy_ynwa", "url": null, "created_at": "Thu Nov 17 10:41:05 +0000 2011", "contributors_enabled": false, "time_zone": "Jakarta", "profile_sidebar_border_color": "C0DEED", "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 187179645026836481, "source": "<a href=\"https://embr.in\" rel=\"nofollow\">embr</a>"}
{"favorited": false, "in_reply_to_user_id": 441527150, "contributors": null, "truncated": false, "text": "@akoriko1046 \u5bdd\u308b\u306e\uff1f\u3000\u5f85\u3063\u3066\u50d5\u3082\u884c\u304f\u3088\u2026\u5e03\u56e3\u307e\u3067\u304a\u59eb\u69d8\u62b1\u3063\u3053\u3057\u3066\u3044\u3063\u3066\u3042\u3052\u308b", "created_at": "Tue Apr 03 14:07:59 +0000 2012", "retweeted": false, "in_reply_to_status_id": 187179532103598080, "coordinates": null, "in_reply_to_user_id_str": "441527150", "entities": {"user_mentions": [{"indices": [0, 12], "screen_name": "akoriko1046", "id": 441527150, "name": "\u30a2\u30b3\u30ea\u30b3", "id_str": "441527150"}], "hashtags": [], "urls": []}, "in_reply_to_status_id_str": "187179532103598080", "id_str": "187179645014253568", "in_reply_to_screen_name": "akoriko1046", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/440543906/122004876_org.jpg", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1870697453/__________bg_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "is_translator": false, "id": 513679998, "profile_text_color": "333333", "followers_count": 169, "protected": false, "location": "\u3042\u306a\u305f\u306e\u96a3", "default_profile_image": false, "listed_count": 2, "utc_offset": 32400, "statuses_count": 6024, "description": "\u8584\u685c\u9b3c\u6c96\u7530\u7dcf\u53f8\u306e\u975e\u516c\u5f0fbot\u3067\u3059\u7518\u7518/\u30a8\u30ed\u8a2d\u5b9a\u3000\uff8c\uff6b\uff9b\uff70\u306e\u518d\u306f\u5fc5\u305a\u8aac\u660e\u66f8\u3092\u4e00\u8aad\u4e0b\u3055\u3044http://www.pixiv.net/novel/show.php?id=934499 \u624b\u52d5\u3067\u30d5\u30a9\u30ed\u8fd4\u3057\u3092\u884c\u3063\u3066\u307e\u3059\u3000\u7a00\u306b\u4e2d\u306b\u7ba1\u7406\u4eba\u304c\u3044\u307e\u3059\u3000\u7ba1\u7406\u4eba@akanemam1 18\u7981\u7dcf\u53f8\u2192 @sou_oki_18bot", "friends_count": 166, "profile_link_color": "0084B4", "profile_image_url": "http://a0.twimg.com/profile_images/1870697453/__________bg_normal.jpg", "notifications": null, "show_all_inline_media": false, "geo_enabled": false, "profile_background_color": "C0DEED", "id_str": "513679998", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/440543906/122004876_org.jpg", "screen_name": "sou_oki_bot", "lang": "ja", "profile_background_tile": false, "favourites_count": 1, "name": "\u7dcf\u53f8(bot)", "url": null, "created_at": "Sat Mar 03 22:36:15 +0000 2012", "contributors_enabled": false, "time_zone": "Irkutsk", "profile_sidebar_border_color": "C0DEED", "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 187179645014253568, "source": "<a href=\"http://twittbot.net/\" rel=\"nofollow\">twittbot.net</a>"}
{"favorited": false, "in_reply_to_user_id": 141448885, "contributors": null, "truncated": false, "text": "@nobuttu3 \u6642\u9593\u304c\u904e\u304e\u308b\u306e\u304c\u7269\u51c4\u304f\u65e9\u3044\u3067\u3059\u3088\u306d\u2026", "created_at": "Tue Apr 03 14:07:59 +0000 2012", "retweeted": false, "in_reply_to_status_id": 187179547098234880, "coordinates": null, "in_reply_to_user_id_str": "141448885", "entities": {"user_mentions": [{"indices": [0, 9], "screen_name": "nobuttu3", "id": 141448885, "name": "\u306e\u4ecf \uf8ff \u30bf\u30ab\u30cf\u30b7", "id_str": "141448885"}], "hashtags": [], "urls": []}, "in_reply_to_status_id_str": "187179547098234880", "id_str": "187179645047799808", "in_reply_to_screen_name": "nobuttu3", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/424878981/tw_hvt_nz.jpg", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1223811261/twitter_icon_normal.png", "profile_sidebar_fill_color": "daecf4", "is_translator": false, "id": 97481308, "profile_text_color": "663B12", "followers_count": 436, "protected": false, "location": "\u6771\u4eac\u90fd\u53f0\u6771\u533a", "default_profile_image": false, "listed_count": 20, "utc_offset": 32400, "statuses_count": 63704, "description": "\u591a\u5206PG\u3001\u6642\u3005SE\u307d\u3044\u4ed5\u4e8b\u3092\u3057\u3066\u3044\u307e\u3059\u3002\u30e9\u30ce\u30d9\u597d\u304d\u3001\u97f3\u697d\u597d\u304d(\u7279\u5b9a\u306e\u5206\u91ce\u3067\u3059\u304c)\u3002\u30bd\u30b3\u30bd\u30b3\u306e\u983b\u5ea6\u3067\u79cb\u8449\u539f\u306b\u3044\u305f\u308a\u3082\u3057\u307e\u3059\u3002 ", "friends_count": 896, "profile_link_color": "1F98C7", "profile_image_url": "http://a0.twimg.com/profile_images/1223811261/twitter_icon_normal.png", "notifications": null, "show_all_inline_media": false, "geo_enabled": false, "profile_background_color": "ffffff", "id_str": "97481308", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/424878981/tw_hvt_nz.jpg", "screen_name": "xi6", "lang": "ja", "profile_background_tile": false, "favourites_count": 4473, "name": "\u3055\u304f", "url": null, "created_at": "Thu Dec 17 16:55:25 +0000 2009", "contributors_enabled": false, "time_zone": "Tokyo", "profile_sidebar_border_color": "C6E2EE", "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 187179645047799808, "source": "<a href=\"http://tapbots.com/tweetbot\" rel=\"nofollow\">Tweetbot for iOS</a>"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "#ImSingleBecause lolz I'm not. Happily taken by @GarrettBettler <33 I love him, forever :)", "created_at": "Tue Apr 03 14:07:59 +0000 2012", "retweeted": false, "in_reply_to_status_id": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [48, 63], "screen_name": "GarrettBettler", "id": 460816116, "name": "Garrett Bettler", "id_str": "460816116"}], "hashtags": [{"indices": [0, 16], "text": "ImSingleBecause"}], "urls": []}, "in_reply_to_status_id_str": null, "id_str": "187179645039427584", "in_reply_to_screen_name": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/452188318/tanja_beach_2007_001.JPG", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1971847266/image_normal.jpg", "profile_sidebar_fill_color": "f6ffd1", "is_translator": false, "id": 461432420, "profile_text_color": "333333", "followers_count": 222, "protected": false, "location": "", "default_profile_image": false, "listed_count": 0, "utc_offset": null, "statuses_count": 2334, "description": "", "friends_count": 192, "profile_link_color": "0099CC", "profile_image_url": "http://a0.twimg.com/profile_images/1971847266/image_normal.jpg", "notifications": null, "show_all_inline_media": false, "geo_enabled": false, "profile_background_color": "FFF04D", "id_str": "461432420", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/452188318/tanja_beach_2007_001.JPG", "screen_name": "LeahOswalt", "lang": "en", "profile_background_tile": false, "favourites_count": 86, "name": "Leah Oswalt", "url": null, "created_at": "Wed Jan 11 20:07:24 +0000 2012", "contributors_enabled": false, "time_zone": null, "profile_sidebar_border_color": "fff8ad", "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 187179645039427584, "source": "<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>"}
{"favorited": false, "in_reply_to_user_id": 434884235, "contributors": null, "truncated": false, "text": "@nomimushi_ttk \u3068\u30fc\u3084\u3082\u7d20\u6575\u3060\u3051\u3069\u306e\u307f\u3080\u3057\u306e\u30a2\u30a4\u30b3\u30f3\u5929\u4f7f\u3059\u304e\u3066", "created_at": "Tue Apr 03 14:07:59 +0000 2012", "retweeted": false, "in_reply_to_status_id": 187179241664815105, "coordinates": null, "in_reply_to_user_id_str": "434884235", "entities": {"user_mentions": [{"indices": [0, 14], "screen_name": "nomimushi_ttk", "id": 434884235, "name": "\u306e\u307f\u3080\u3057", "id_str": "434884235"}], "hashtags": [], "urls": []}, "in_reply_to_status_id_str": "187179241664815105", "id_str": "187179645026836480", "in_reply_to_screen_name": "nomimushi_ttk", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme1/bg.png", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/2023688835/6222d8ae387e6fb5b220895d2fd2d41a_normal.gif", "profile_sidebar_fill_color": "DDEEF6", "is_translator": false, "id": 365471550, "profile_text_color": "333333", "followers_count": 308, "protected": false, "location": "\u5b66\u5712\u30a2\u30ea\u30b9\u306b\u518d\u71b1\u306a\u3046", "default_profile_image": false, "listed_count": 17, "utc_offset": 32400, "statuses_count": 25562, "description": "\uff8c\uff9e\uff9a10/\u3046\u305f\u30d7\u30ea/HTF/\u3044\u306c\u307c\u304f\u306a\u3069\u306b\u304a\u71b1/\u5d50\u306e\u5927\u91ce\u304f\u3093\u3059\u304d\uff01\u64ec\u4eba\u5316\u3082\u3050\u3082\u3050/\u30a4\u30ca\u30a4\u30ec/RKRN/pkmn/\uff83\uff86\uff8c\uff9f\uff98/\u4e59\u5973\uff79\uff9e\uff70\u5168\u822c\u3082 [\u30bf\u30ab\u4e38\u3055\u3093\u30e2\u30b0\u30e2\u30b0\u30da\u30c3\u3063\u3066\u3057\u968a\u54e1No.2\uff3c\u526f\u968a\u9577\uff0f]\u3000\u898f\u5236\u57a2\u3010@ao_sanagi_2\u3011\u30a2\u30a4\u30b3\u30f3\u306f\u3068\u30fc\u3084\u304b\u3089\uff01", "friends_count": 284, "profile_link_color": "0084B4", "profile_image_url": "http://a0.twimg.com/profile_images/2023688835/6222d8ae387e6fb5b220895d2fd2d41a_normal.gif", "notifications": null, "show_all_inline_media": false, "geo_enabled": false, "profile_background_color": "C0DEED", "id_str": "365471550", "profile_background_image_url": "http://a0.twimg.com/images/themes/theme1/bg.png", "screen_name": "ao_sanagi", "lang": "ja", "profile_background_tile": false, "favourites_count": 1071, "name": "\u8475@\u6284\u82b1\u306e\u5ac1", "url": null, "created_at": "Wed Aug 31 14:07:23 +0000 2011", "contributors_enabled": false, "time_zone": "Tokyo", "profile_sidebar_border_color": "C0DEED", "default_profile": true, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 187179645026836480, "source": "<a href=\"http://www.movatwi.jp\" rel=\"nofollow\">\u30e2\u30d0\u30c4\u30a4 / www.movatwi.jp .</a>"}
我的最终目标是计算特定单词在所有推文的“文本”字段中出现的次数。我尝试了多种不同的方法,取得了不同程度的成功,但我现在所处的位置是:
import fileinput
import json
import sys
import os
line = []
inputfilename = sys.argv[1]
for line in fileinput.input([inputfilename]):
tweettext = json.loads(line).get('text').split()
print tweettext
这将遍历文件并将文本拆分为每行“文本”字段中的单个单词,但不会创建单个单词列表。要在遇到空行时添加问题,它会失败:
[u'RT', u'@keenakan:', u'kamu', u'tidak', u'perlu', u'memperjuangkan', u'aku.', u'Yang', u'perlu', u'ialah', u'aku', u'dan', u'kamu', u'yang', u'memperjuangkan', u'kita.', u'-@commaditya']
[u'RT', u'@TheRealToxicBoi:', u'#LiesBeforeSex', u"I'll", u'be', u'Gentle!']
[u'@coliriostar', u'Quer', u'GANHAR', u'R$', u'300,00', u'em', u'vale', u'compra?', u'SIGA', u'@eucompronanet', u'e', u'saiba', u'como', u'participar,', u'\xe9', u'simples', u'e', u'r\xe1pido!', u'at\xe9', u'+', u'ci']
Traceback (most recent call last):
File "newexample.py", line 11, in <module>
tweettext = json.loads(line).get('text').split()
AttributeError: 'NoneType' object has no attribute 'split'
任何人都可以提出解决方案吗?
编辑:
根据第一条评论,我根据自己的理解将代码编辑如下:
import fileinput
import json
import sys
import os
line = []
tw = 0
inputfilename = sys.argv[1]
for line in fileinput.input([inputfilename]):
line = line.strip();
if not line: continue
tweettext = json.loads(line).get('text')
if not json.loads(line).get('text'):
continue
words = tweettext.split()
print words
tw = len(words)
print "total number of words", tw
我的输出看起来更好,至少我不再收到“属性错误:NoneType”。现在输出似乎由单个字典组成,而不仅仅是一个大字典。同样,我的目标是计算每个单词出现的次数,除非它们都在一个字典中,否则我不确定该怎么做。这是此时的输出示例:
[u'L', u'Lawliet', u'(Sweets', u'Addict)', u'+', u'Kenshin', u'Himura', u'(Samurai)', u'+', u'Kyon', u'(Lazy', u'and', u'Carefree', u'Bum)', u'=', u'Sakata', u'Gintoki', u'xD', u'May...', u'http://t.co/LD4E1j1v']
[u'Yay', u'~', u'I', u'have', u'ice~I', u'can', u'reach', u'the', u'ice', u'maker!', u'ch', u'sees', u'gaps', u'in', u'the', u'freezer', u'as', u'a', u'challenge', u'and', u"it's", u'usually', u'full', u'to', u'busting.', u'But', u'not', u'now', u'Haha!']
[u'Hoi']
[u'everyones', u'on', u'twitter.']
total number of words 429023
我想我可能会以某种方式为 for 循环中的每个单词设置计数器。?如您所见,总字数工作正常,因为它增加了每行的字数,但我不太清楚如何确定唯一字,例如:
len(set(words))
编辑:
这是我的最终解决方案:
import fileinput
import json
import sys
import os
from collections import defaultdict
line = []
tw = 0
inputfilename = sys.argv[1]
word_count = defaultdict(int)
for line in fileinput.input([inputfilename]):
line = line.strip();
if not line: continue
tweettext = json.loads(line).get('text')
if not json.loads(line).get('text'):
continue
words = tweettext.split()
tw += len(words)
for word in words:
word_count[word]+=1
print word_count
print "total number of words", tw