我正在尝试使用 PDFMiner 以一致的方式从 PDF 中提取信息,以便做进一步分析,但我一直没弄清楚如何正确提取表格数据。PDFMiner 似乎是先按列提取、再按行提取,导致同一行的单元格被拆散。有没有人解决过这个问题,或者知道让它先按行提取的方法?我也尝试过提取为 HTML,但遇到了同样的问题。如能提供任何帮助,我将不胜感激。
实际 PDF 中表格的截图:
提取结果的截图:
我用于提取的代码如下:
import nltk
import numpy
import pip
import pdfminer
import dateutil
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file using PDFMiner.

    Each page is run through a ``PDFPageInterpreter`` whose output device is
    a ``TextConverter`` writing into an in-memory ``StringIO`` buffer.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The full contents of the text buffer after all pages have been
        processed, as a single string.

    Raises:
        IOError: If ``path`` cannot be opened for reading.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin, and the with
        # block closes the handle even if PDFMiner raises part-way through.
        with open(path, 'rb') as fp:
            # NOTE(review): an empty pagenos set and maxpages=0 presumably
            # mean "no page restriction" -- confirm against PDFPage.get_pages.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # The buffer is cumulative across pages, so read it exactly once
        # after the loop; appending getvalue() per page would repeat the
        # text of earlier pages in the result.
        return retstr.getvalue()
    finally:
        # The return value above is computed before these run.
        device.close()
        retstr.close()
# Raw string literal: the Windows path contains backslashes, and a plain
# "\U..." sequence is parsed as a Unicode escape (a syntax error on Python 3).
test1 = convert_pdf_to_txt(r"C:\Users\User\Documents\Contract\Dental\Certificate - Dental - Assurant - 2010.pdf")