
I wrote a script to calculate lexical diversity and some other meaningful statistics. My problem is that on some files it fails when it hits what I can only assume is a bad JSON line. Each of my data files contains JSON strings, one per line. The field that matters for these calculations is the "text" field.

My code:

import fileinput
import json
import sys
import os
from collections import defaultdict

line = []                                                       # set to list
tw = 0                                                          # set total words to int
tuw = 0                                                         # set total unique words to int
lexd = 0                                                        # set total lexical diversity to int
awpt = 0                                                        # set average words per tweet to int
line_counter = 0

inputfilename = sys.argv[1]                                     # read the first system argument as the input file name

word_count = defaultdict(int)                                   # set word_count to the default dictionary

for line in fileinput.input([inputfilename]):                   # FOR each line in the input file
        line = line.strip()                                             # strip leading/trailing whitespace
        if not line: continue                                           # skip blank lines (e.g. at EOF)
        tweettext = json.loads(line).get('text')                        # load the line with json.loads and get the "text" field
        if not json.loads(line).get('text'): continue                   # if the JSON has no "text" field then skip this line
        words = tweettext.split()                                       # split the "text" string into individual words
        tw += len(words)                                                # total words counter
        line_counter += 1                                               # total lines counter
        print line_counter                                              # so we know what line we're on
        for word in words:                                              # FOR each word in the individual line "text" corpus
                word_count[word]+=1                                             # take the word_count dict, insert the word and increment

tuw = len(set(word_count))                                      # calculate the total number of unique words
lexd += 1.0*tuw/tw                                              # calculate the lexical diversity
awpt = 1.0*tuw/line_counter                                     # calc average number of words per tweet

print word_count                                                # print the word list dictionary
print "total number of words", tw                               # print the total number of words
print "total uniq words", tuw                                   # print the total number of unique words
print "total corpus lexical diversity", lexd                    # print the total lexical diversity of the entire corpus
print "average number of words per tweet", awpt                 # print the average number of words per tweet

Sample data:

{"favorited": false, "in_reply_to_user_id": 213741147, "contributors": null, "truncated": false, "text": "@Rafinha_Angelo sim sim, manda o print l\u00e1 HUSAHUS!", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": "169216950453542912", "coordinates": null, "in_reply_to_user_id_str": "213741147", "entities": {"user_mentions": [{"indices": [0, 15], "screen_name": "Rafinha_Angelo", "id": 213741147, "name": "Rafael A. Figueiredo", "id_str": "213741147"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": 169216950453542912, "id_str": "169217034821976067", "in_reply_to_screen_name": "Rafinha_Angelo", "user": {"follow_request_sent": null, "profile_use_background_image": true, "profile_background_image_url_https": "https://si0.twimg.com/images/themes/theme9/bg.gif", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1769152407/223_normal.JPG", "profile_sidebar_fill_color": "252429", "is_translator": false, "id": 67115876, "profile_text_color": "666666", "followers_count": 310, "profile_sidebar_border_color": "181A1E", "location": "Somewhere.", "default_profile_image": false, "listed_count": 0, "utc_offset": -10800, "statuses_count": 6027, "description": "it's like one more day, with no more things !", "friends_count": 106, "profile_link_color": "2FC2EF", "profile_image_url": "http://a2.twimg.com/profile_images/1769152407/223_normal.JPG", "notifications": null, "show_all_inline_media": false, "geo_enabled": true, "profile_background_color": "1A1B1F", "id_str": "67115876", "profile_background_image_url": "http://a1.twimg.com/images/themes/theme9/bg.gif", "screen_name": "Guiii_Fernandes", "lang": "en", "profile_background_tile": false, "favourites_count": 112, "name": "Guilherme Fernandes", "url": "http://facebook.com/GuiiFernandes", "created_at": "Wed Aug 19 20:43:05 +0000 2009", "contributors_enabled": false, "time_zone": "Brasilia", "protected": false, "default_profile": false, "following": null}, "place": null, "retweet_count": 0, "geo": null, "id": 169217034821976067, "source": "web"}
{"favorited": false, "in_reply_to_user_id": null, "contributors": null, "retweeted_status": {"favorited": false, "in_reply_to_user_id": null, "contributors": null, "truncated": false, "text": "On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Borat voice). Xoxo, JM", "created_at": "Mon Feb 13 23:27:08 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169200965151494144", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 69751644, "description": "", "verified": true, "profile_image_url_https": "https://si0.twimg.com/profile_images/387138234/1_normal.jpg", "profile_sidebar_fill_color": "5c5c5c", "is_translator": false, "geo_enabled": false, "profile_text_color": "333333", "followers_count": 473162, "profile_sidebar_border_color": "00e35f", "id_str": "69751644", "default_profile_image": false, "location": "Los Angeles", "utc_offset": -28800, "statuses_count": 5380, "profile_background_color": "00e35f", "friends_count": 10730, "profile_link_color": "05bcff", "profile_image_url": "http://a0.twimg.com/profile_images/387138234/1_normal.jpg", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/72720138/green.jpg", "profile_background_image_url": "http://a0.twimg.com/profile_background_images/72720138/green.jpg", "screen_name": "jamesmaslow", "lang": "en", "profile_background_tile": false, "favourites_count": 1, "name": "james maslow", "url": "http://www.JamesMaslow.com", "created_at": "Sat Aug 29 01:32:02 +0000 2009", "contributors_enabled": false, "time_zone": "Pacific Time (US & Canada)", "protected": false, "default_profile": false, "following": null, "listed_count": 8348}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169200965151494144, "source": "<a href=\"http://www.osfoora.com\" rel=\"nofollow\">Osfoora for iPhone</a>"}, "truncated": true, "text": "RT @jamesmaslow: On the stage in Vegas for the last few days of rehearsal...this is epic! Going to be a huge show. I like! (said in Bora ...", "created_at": "Tue Feb 14 00:30:59 +0000 2012", "retweeted": false, "in_reply_to_status_id_str": null, "coordinates": null, "in_reply_to_user_id_str": null, "entities": {"user_mentions": [{"indices": [3, 15], "id_str": "69751644", "id": 69751644, "name": "james maslow", "screen_name": "jamesmaslow"}], "hashtags": [], "urls": []}, "in_reply_to_status_id": null, "id_str": "169217034817765377", "place": null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "id": 466873377, "description": "Totally dedicate for @1LoganHenderson MINE perfect BTBoy!!!! *--* Rusher for the infinity and beyond and much more beyond!!! 
Since 01/17/12 =*", "verified": false, "profile_image_url_https": "https://si0.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "profile_sidebar_fill_color": "940a2d", "is_translator": false, "geo_enabled": false, "profile_text_color": "eb4466", "followers_count": 103, "profile_sidebar_border_color": "d61153", "id_str": "466873377", "default_profile_image": false, "location": "", "utc_offset": -7200, "statuses_count": 3730, "profile_background_color": "070808", "friends_count": 154, "profile_link_color": "de243d", "profile_image_url": "http://a2.twimg.com/profile_images/1781262618/_20120126_023206_424_normal.gif", "notifications": null, "show_all_inline_media": false, "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "profile_background_image_url": "http://a3.twimg.com/profile_background_images/421884815/tumblr_lz7316OE041rnvmm7o1_500.jpg", "screen_name": "Logiehbear", "lang": "en", "profile_background_tile": true, "favourites_count": 209, "name": "BBFFF da Laryh!!", "url": null, "created_at": "Tue Jan 17 21:53:17 +0000 2012", "contributors_enabled": false, "time_zone": "Mid-Atlantic", "protected": false, "default_profile": false, "following": null, "listed_count": 1}, "in_reply_to_screen_name": null, "retweet_count": 465, "geo": null, "id": 169217034817765377, "source": "web"}

Script output:

1
2
defaultdict(<type 'int'>, {u'be': 1, u'is': 1, u'Going': 1, u'in': 2, u'I': 1, u'(said': 1, u'RT': 1, u'huge': 1, u'for': 1, u'l\xe1': 1, u'few': 1, u'Vegas': 1, u'manda': 1, u'print': 1, u'sim,': 1, u'sim': 1, u'On': 1, u'to': 1, u'like!': 1, u'HUSAHUS!': 1, u'rehearsal...this': 1, u'@jamesmaslow:': 1, u'...': 1, u'epic!': 1, u'stage': 1, u'a': 1, u'show.': 1, u'last': 1, u'of': 1, u'days': 1, u'o': 1, u'@Rafinha_Angelo': 1, u'the': 2, u'Bora': 1})
total number of words 36
total uniq words 34
total corpus lexical diversity 0.944444444444
average number of words per tweet 17.0
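
For this two-line sample the printed figures follow directly from the formulas in the script: lexd = 34/36 ≈ 0.944, and awpt = 34/2 = 17.0 (awpt divides tuw, the unique-word count, by the line count).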

This actually runs very fast, but on some of my datasets it fails after a few thousand lines:

Traceback (most recent call last):
  File "lex.py", line 21, in <module>
    tweettext = json.loads(line).get('text')                        # load the line with json.loads and get the "text" field
  File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
    return _default_decoder.decode(s)
  File "/usr/lib64/python2.7/json/decoder.py", line 366, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib64/python2.7/json/decoder.py", line 382, in raw_decode
    obj, end = self.scan_once(s, idx)
ValueError: Unterminated string starting at: line 1 column 1531 (char 1531)
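
For reference, the same ValueError can be reproduced with a deliberately truncated JSON string, so it really does look like a malformed line in the data rather than something in my code (a minimal sketch, not an actual line from my data):

import json

# Hypothetical truncated line: json.loads raises ValueError when a string
# literal is never closed, which is what the traceback above reports.
try:
    json.loads('{"text": "this string is never closed')
except ValueError as e:
    print e                        # prints an "Unterminated string starting at: ..." message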

If the script chokes on the format of the line it's reading, I'd like to skip that line and move on. Any suggestions?


1 Answer


I solved this using Jesse Harris's solution: catching the exception when json.loads throws an error.

import fileinput
import json
import sys
import os
from collections import defaultdict

line = []                                                     
tw = 0                                                        
tuw = 0                                                        
lexd = 0                                                       
awpt = 0                                                       
line_counter = 0

inputfilename = sys.argv[1]                                   

word_count = defaultdict(int)                                 

for line in fileinput.input([inputfilename]):                  
        line = line.strip()
        if not line: continue   
        try:         
               tweettext = json.loads(line).get('text')                       
               if not json.loads(line).get('text'): continue                   
               words = tweettext.split()                                       
               tw += len(words)                                                
               line_counter += 1                                               
               print line_counter                                              
               for word in words:                                              
                      word_count[word]+=1                                             
        except:
               print "Problem Line: " + line

tuw = len(set(word_count))                                      
lexd += 1.0*tuw/tw                                             
awpt = 1.0*tuw/line_counter                                     

# print word_count                                               
print "total number of words", tw                              
print "total uniq words", tuw                                  
print "total corpus lexical diversity", lexd                   
print "average number of words per tweet", awpt                

When I ran this against my data, it printed out a gzip'd line of data. As I mentioned in my previous comment, this was due to switching to the gzip streaming API from Twitter. Two thumbs up to @jesseharris
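
In case anyone else runs into the gzip'd lines after switching to the gzip streaming API: one option is to decompress the file before parsing. A minimal sketch, assuming the whole file was saved as gzip-compressed JSON with one object per line (the filename is just a placeholder):

import gzip
import json

# Hypothetical filename; gzip.open transparently decompresses, so each line
# can be fed to json.loads the same way as the plain-text files above.
with gzip.open('tweets.json.gz', 'rb') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            tweettext = json.loads(line).get('text')
        except ValueError:
            continue               # skip lines that are not valid JSON
        if not tweettext:
            continue
        # ... same word counting as above ...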

answered 2012-06-03T05:52:05.747