1

我有一个包含所有元数据的一堆推文的 .txt 文件。

我正在尝试将它们读入一个名为 twitter 的 python sqlite db。我想分离出推文的每个属性(ID、创建时间、文本、位置等)并将每个属性放入自己的字段中。

我已经在 Python sqlite 中创建了表。

import sqlite3
conn = sqlite3.connect('twitter.db')
c = conn.cursor()

st='''CREATE TABLE Tweet
 (
    created_at VARCHAR2(25),
    id VARCHAR2(25),
    text VARCHAR2(25),
    source VARCHAR2(25), 
    in-reply_to_user_ID VARCHAR2(25), 
    retweet_Count VARCHAR2(25)

 ); '''

 c.execute(st)

 lineArray=open("file.txt").readlines()
 for elt in lineArray:
     print elt

lineArray[0][:-1].split(', ')

#Loads variables 
 for elt in lineArray:
    currentRow = elt[:-1].split(", ")
     insert = """insert into Tweet values ('%s', '%s', '%s', %s, %s, %s)""" %("created_at", "id",     "text", 'source', 'in-reply_to_user_ID', 'retweet_Count')
print insert
c.execute

有人还建议我为此使用 numpy - 但也不确定如何使用。

每条推文如下所示:

{"created_at":"Fri Oct 11 00:00:03 +0000 2013",
"id":388453908911095800,
"id_str":"388453908911095809",
"text":"LAGI PUN VISITORS DATANG PUKUL 9 AH",
"source":"<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>",
"truncated":false,
"in_reply_to_status_id":null,
"in_reply_to_status_id_str":null,
"in_reply_to_user_id":null,
"in_reply_to_user_id_str":null,
"in_reply_to_screen_name":null,
"user":{
    "id":447800506,
    "id_str":"447800506",
    "name":"§yazwina·",
    "screen_name":"_SAireen",
    "location":"SSP",
    "url":"http://flavors.me/syazwinaaireen#",
    "description":"Absence makes the heart grow fonder. Stay us x @_DFitri's",
    "protected":false,
    "followers_count":806,
    "friends_count":702,
    "listed_count":2,
    "created_at":"Tue Dec 27 08:29:53 +0000 2011",
    "favourites_count":7478,
    "utc_offset":28800,
    "time_zone":"Beijing",
    "geo_enabled":true,
    "verified":false,
    "statuses_count":32558,
    "lang":"en",
    "contributors_enabled":false,
    "is_translator":false,
    "profile_background_color":"DBE9ED",
    "profile_background_image_url":"http://a0.twimg.com/profile_background_images/378800000056283804/65d84665fbb81deba13427e8078a3eff.png",
    "profile_background_image_url_https":"https://si0.twimg.com/profile_background_images/378800000056283804/65d84665fbb81deba13427e8078a3eff.png",
    "profile_background_tile":true,
    "profile_image_url":"http://a0.twimg.com/profile_images/378800000264138431/fd9d57bd1b1609f36fd7159499a94b6e_normal.jpeg",
    "profile_image_url_https":"https://si0.twimg.com/profile_images/378800000264138431/fd9d57bd1b1609f36fd7159499a94b6e_normal.jpeg",
    "profile_banner_url":"https://pbs.twimg.com/profile_banners/447800506/1369969522",
    "profile_link_color":"FA0096",
    "profile_sidebar_border_color":"FFFFFF",
    "profile_sidebar_fill_color":"E6F6F9",
    "profile_text_color":"333333",
    "profile_use_background_image":true,
    "default_profile":false,
    "default_profile_image":false,
    "following":null,
    "follow_request_sent":null,
    "notifications":null
    },
"geo":null,
"coordinates":null,
"place":null,
"contributors":null,
"retweet_count":0,
"favorite_count":0,
"entities":{
    "hashtags":[],
    "symbols":[],
    "urls":[],
    "user_mentions":[]
    },
"favorited":false,
"retweeted":false,
"filter_level":"medium",
"lang":"it"}
4

0 回答 0