所以我有这段代码,它需要 mbox 文件,然后将它们转换为 sqlite 数据库。但是,我得到一个空数据库。无论我使用哪个 mbox,生成的数据库都是 12kb。当我尝试查看数据库/分析它时,结果是什么都没有,没有任何信息或任何东西,尽管已经制作了表和键。但里面没有信息。可能是什么问题呢?是脚本没有选择mbox,还是循环内部有问题?
import mailbox
import os
import email
from bs4 import BeautifulSoup
import sqlite3
# Current working directory
cwd = os.path.dirname(os.path.realpath(__file__))
# First find all the mbox files using os.walk
mbox_path = cwd + '/mbox_files'
mbox_files = []
pattern = '*.mbox'
for root, dirs, files in os.walk(mbox_path):
for filename in fnmatch.filter(files, pattern):
mbox_files.append((filename, os.path.join(root, filename)))
# Now process each message in the folder
for mbox_file in mbox_files:
src_mbox = mailbox.mbox(mbox_file[1])
for msg in src_mbox:
sender = name_email(msg['From'])
recipient = name_email(msg['To'])
b = email.message_from_string(str(msg))
if b.is_multipart():
for payload in b.get_payload():
p = payload.get_payload()
if isinstance(p,list):
html_text = p[0]
else:
html_text = p
try:
# Remove any HTML tags, and any inline styles
soup = BeautifulSoup(str(html_text))
[s.extract() for s in soup('style')]
text = soup.text.strip()
except:
pass
# Just in case we get a plain text email
else:
text = b.get_payload()
row = [
None,
sender[0],
sender[1],
recipient[0],
recipient[1],
msg['Subject'],
topic,
msg['Date'],
msg['Message-ID'],
text
]
cur.execute("INSERT INTO emails VALUES(?,?,?,?,?,?,?,?,?,?);", row)
# A litle utility function that separates name and email from strings like '"Some Name" <some@email.com>'
def name_email(s):
if not s:
return None, None
pieces = s.split('<')
if len(pieces) > 1:
name = pieces[0].replace('"','').strip()
email = pieces[1].replace('>','')
else:
name = None
email = pieces[0].replace('>','')
return name,email
conn = sqlite3.connect(cwd + '/test.db')
cur = conn.cursor()
# Create the table.
cur.execute("DROP TABLE IF EXISTS emails")
cur.execute("CREATE TABLE emails(id INTEGER PRIMARY KEY, sender_name TEXT, sender_email TEXT, recipient_name TEXT, recipient_email TEXT, subject TEXT, conversation_topic TEXT, message_date TEXT, message_id TEXT, text_body TEXT)")
cur.execute("CREATE INDEX index_sender_name ON emails (sender_name)")