发布这段代码,只是为了提供一份可在 Python 3.5 下运行、用于类似 CSV 解析的代码。列的拆分方式是最简单的一种,但对我来说够用。
感谢 tgray:本代码以他在这个答案中的代码作为起点。
另外还用到了 openpyxl,因为我更喜欢直接在 Excel 中得到结果。
# works with py35 & pip-installed pdfminer.six in 2017
def pdf_to_csv(filename, column_gap=8):
    """Extract the text of a PDF as semicolon-separated, CSV-like lines.

    Every character found directly on a page is bucketed into a text line
    by the integer part of its (negated) y coordinate.  The characters of
    each line are then ordered left to right and cut into columns wherever
    two neighbouring characters are more than ``column_gap`` apart.

    Args:
        filename: Path of the PDF file to read.
        column_gap: Horizontal distance, in bounding-box units, between two
            consecutive characters above which a new column starts.  It
            stands in for the font width and needs adjusting for the
            specific PDF.  Defaults to 8.

    Returns:
        A single string covering all pages.  Each page is wrapped in
        ``START PAGE n`` / ``END PAGE n`` marker lines (``n`` counted from
        0); every text line in between has its columns joined by ``;`` and
        ends with a newline.  Cell text is not escaped, so a literal ``;``
        in the PDF cannot be told apart from a column separator.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    from collections import defaultdict
    from io import StringIO

    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams, LTChar
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """Converter that writes one ';'-joined line per row of characters."""

        def end_page(self, page):
            # Map: line key -> {x coordinate -> character text}.
            rows = defaultdict(dict)
            for child in self.cur_item:
                if isinstance(child, LTChar):
                    # Only the last two bbox components are used as the
                    # character's position.
                    _, _, x, y = child.bbox
                    # y is negated so that an ascending sort of the keys
                    # visits larger y values (higher on the page) first.
                    rows[int(-y)][x] = child.get_text()

            for row_key in sorted(rows):
                row = rows[row_key]
                # Combine close letters to form columns.
                xs = sorted(row)
                cells = []
                text = ''
                for x, next_x in zip(xs, xs[1:]):
                    text += row[x]
                    if next_x - x > column_gap:
                        cells.append(text)
                        text = ''
                # The pairwise loop never consumes the last character, so
                # it is added here together with the pending column text.
                cells.append(text + row[xs[-1]])

                self.outfp.write(";".join(cells))
                self.outfp.write("\n")

    # The following part is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    try:
        # The context manager closes the file even when parsing fails.
        with open(filename, 'rb') as fp:
            # get_pages() builds its own parser and document from fp, so no
            # separate PDFParser/PDFDocument objects are created here.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page_no, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % page_no)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % page_no)
    finally:
        device.close()

    return outfp.getvalue()
import openpyxl as pxl

# Convert the PDF to ';'-separated text, then copy it row by row into an
# Excel workbook saved next to it.
fn = 'your_file.pdf'
result = pdf_to_csv(fn)

# pdf_to_csv ends every line with '\n', so the last fragment of the split is
# an empty string; drop it so the sheet does not finish with a spurious row
# holding a single empty cell.
lines = result.split('\n')
if lines[-1] == '':
    lines.pop()

wb = pxl.Workbook()
ws = wb.active
for line in lines:
    # Appending a list gives a complete row in the xlsx; each ';'-separated
    # field becomes one cell.
    ws.append(line.split(';'))
wb.save('your_file.xlsx')