将 outlook.msg 正文提取到文本文件
def getEmailBodyFromMsg():
mapi.MAPIInitialize ((mapi.MAPI_INIT_VERSION, 0))
storage_flags = win32com.storagecon.STGM_DIRECT | win32com.storagecon.STGM_READ | win32com.storagecon.STGM_SHARE_EXCLUSIVE
filepathList = glob.glob('*.msg')
for filepath in filepathList :
txtFilepath = os.path.splitext(ntpath.basename(filepath))[0]
resultFile = txtFilepath + datetime.now().strftime('%Y-%m-%d %H_%M_%S')+".txt"
#get body of email and save as txt
storage = pythoncom.StgOpenStorage (filepath, None, storage_flags, None, 0)
mapi_session = mapi.OpenIMsgSession ()
message = mapi.OpenIMsgOnIStg (mapi_session, None, storage, None, 0, mapi.MAPI_UNICODE)
#write to txt file
CHUNK_SIZE = 10000
stream = message.OpenProperty (win32com.mapi.mapitags.PR_BODY, pythoncom.IID_IStream, 0, 0)
text = u""
while True:
bytes = stream.read (CHUNK_SIZE)
if bytes:
text += bytes
else:
break
with codecs.open(resultFile, mode='w', encoding='utf-8') as a_file:
a_file.write(text)
打开上面写入的文件以搜索文本,这些行如下:
with codecs.open(absFilepath, 'rb', encoding='utf-8') as inFile :
for index, line in enumerate(inFile) :
mymatch = re.search(csResultEmailPattern, line, re.UNICODE)
#line = 'R\x00e\x00s\x00u\x00l\x00t\x00s\x00 \x00f\x00r\x00o\x00m\x00\n' #OR 线 = u'R\x00e\x00s\x00u\x00l\x00t\ x00s\x00 \x00f\x00r\x00o\x00m\x00\r'
我想知道它们是否是指定正则表达式(如 resultEmailPattern = ur'Results' 之类的有效方法,该正则表达式将与上面的 'Rx00e..line 匹配或更好的方式来编码 txt 文件