我有一个 .txt 文件,其中包含一批推文及其全部元数据。
我正在尝试将它们读入一个名为 twitter 的 Python SQLite 数据库。我想把每条推文的各个属性(ID、创建时间、文本、位置等)分离出来,并分别存入各自的字段。
我已经在 Python sqlite 中创建了表。
import io
import json
import sqlite3
# Load tweets from file.txt into the Tweet table of twitter.db, storing each
# selected tweet attribute in its own column (one row per tweet).
conn = sqlite3.connect('twitter.db')
c = conn.cursor()

# The reply column was originally spelled "in-reply_to_user_ID"; a hyphen is
# not allowed in an unquoted SQL identifier, so it now uses an underscore.
st = '''CREATE TABLE Tweet
(
created_at VARCHAR2(25),
id VARCHAR2(25),
text VARCHAR2(25),
source VARCHAR2(25),
in_reply_to_user_ID VARCHAR2(25),
retweet_Count VARCHAR2(25)
); '''
c.execute(st)

# "?" placeholders let sqlite3 bind the values itself, so quotes inside tweet
# text cannot break (or inject into) the statement.
insert = "insert into Tweet values (?, ?, ?, ?, ?, ?)"

try:
    # NOTE(review): assumes file.txt holds one complete JSON tweet object per
    # line -- confirm against the real file before relying on this.
    with io.open("file.txt", encoding="utf-8") as tweet_file:
        for elt in tweet_file:
            elt = elt.strip()
            if not elt:
                # Skip blank separator lines instead of failing to parse them.
                continue
            tweet = json.loads(elt)
            currentRow = (
                tweet.get("created_at"),
                # The sample tweet's numeric "id" differs from "id_str" in its
                # last digits, so the string form is stored.
                tweet.get("id_str"),
                tweet.get("text"),
                tweet.get("source"),
                tweet.get("in_reply_to_user_id"),
                tweet.get("retweet_count"),
            )
            print(currentRow)
            c.execute(insert, currentRow)
    # Without a commit the inserted rows are discarded when the connection
    # closes.
    conn.commit()
finally:
    conn.close()
还有人建议我为此使用 numpy,但我也不确定该如何使用。
每条推文如下所示:
{"created_at":"Fri Oct 11 00:00:03 +0000 2013",
"id":388453908911095800,
"id_str":"388453908911095809",
"text":"LAGI PUN VISITORS DATANG PUKUL 9 AH",
"source":"<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>",
"truncated":false,
"in_reply_to_status_id":null,
"in_reply_to_status_id_str":null,
"in_reply_to_user_id":null,
"in_reply_to_user_id_str":null,
"in_reply_to_screen_name":null,
"user":{
"id":447800506,
"id_str":"447800506",
"name":"§yazwina·",
"screen_name":"_SAireen",
"location":"SSP",
"url":"http://flavors.me/syazwinaaireen#",
"description":"Absence makes the heart grow fonder. Stay us x @_DFitri's",
"protected":false,
"followers_count":806,
"friends_count":702,
"listed_count":2,
"created_at":"Tue Dec 27 08:29:53 +0000 2011",
"favourites_count":7478,
"utc_offset":28800,
"time_zone":"Beijing",
"geo_enabled":true,
"verified":false,
"statuses_count":32558,
"lang":"en",
"contributors_enabled":false,
"is_translator":false,
"profile_background_color":"DBE9ED",
"profile_background_image_url":"http://a0.twimg.com/profile_background_images/378800000056283804/65d84665fbb81deba13427e8078a3eff.png",
"profile_background_image_url_https":"https://si0.twimg.com/profile_background_images/378800000056283804/65d84665fbb81deba13427e8078a3eff.png",
"profile_background_tile":true,
"profile_image_url":"http://a0.twimg.com/profile_images/378800000264138431/fd9d57bd1b1609f36fd7159499a94b6e_normal.jpeg",
"profile_image_url_https":"https://si0.twimg.com/profile_images/378800000264138431/fd9d57bd1b1609f36fd7159499a94b6e_normal.jpeg",
"profile_banner_url":"https://pbs.twimg.com/profile_banners/447800506/1369969522",
"profile_link_color":"FA0096",
"profile_sidebar_border_color":"FFFFFF",
"profile_sidebar_fill_color":"E6F6F9",
"profile_text_color":"333333",
"profile_use_background_image":true,
"default_profile":false,
"default_profile_image":false,
"following":null,
"follow_request_sent":null,
"notifications":null
},
"geo":null,
"coordinates":null,
"place":null,
"contributors":null,
"retweet_count":0,
"favorite_count":0,
"entities":{
"hashtags":[],
"symbols":[],
"urls":[],
"user_mentions":[]
},
"favorited":false,
"retweeted":false,
"filter_level":"medium",
"lang":"it"}