我有一个文件,即http://www.agfl.cs.ru.nl/papers/manual28.pdf (英文)
Pdfminer 和 poppler 解析出的大多数页面都显示同样的结果,例如:
¾º¿  ÒÙ Öݸ ¾¼¼ Ⱥ ¾º ÂÙÒ ¸ ¾¼¼ ź Ë ÙØØ Ö¸ Ǻ Ë
看起来这两个库都无法读取字体的自定义编码。应该如何指定该编码?
这是代码示例:
# poppler
# Dump the text of every page of the PDF using the poppler bindings.
input_filename = '/tmp/manual28.pdf'

# poppler is given a file:// URI built from the absolute path.
absolute_path = os.path.abspath(input_filename)
file_uri = 'file://%s' % urllib.pathname2url(absolute_path)
document = poppler.document_new_from_file(file_uri, None)

n_pages = document.get_n_pages()
for i in range(n_pages):
    page = document.get_page(i)
    # A single parenthesised argument prints the same under Python 2.
    print(page.get_text())
    # NOTE: chardet.detect(page.get_text()) reported utf8 every time.
# pdfminer
def pdf_to_html(in_fp, out_fp, codec='utf-8', maxpages=0, pagenos=None, html=True):
    """Convert a PDF to HTML (or plain text) using pdfminer.

    Args:
        in_fp: Path of the PDF to read, or an already-open file object.
            A path is opened in binary read mode.
        out_fp: Path of the file to write, or an already-open file object.
            A path is opened in binary write mode.
        codec: Output encoding handed to the converter.
        maxpages: Forwarded to ``PDFPage.get_pages`` (presumably 0 means
            "no limit" -- confirm against the pdfminer documentation).
        pagenos: Forwarded to ``PDFPage.get_pages`` as the page selection.
        html: When true an ``HTMLConverter`` is used, otherwise a
            ``TextConverter``.

    Both streams are closed before returning, including streams supplied
    by the caller; this matches the original behaviour. They are also
    closed when conversion fails, so no file handle is leaked on error.
    """
    if isinstance(in_fp, basestring):
        in_fp = open(in_fp, 'rb')
    try:
        if isinstance(out_fp, basestring):
            out_fp = open(out_fp, 'wb')
        try:
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            converter_cls = HTMLConverter if html else TextConverter
            device = converter_cls(rsrcmgr, out_fp, codec=codec, laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(in_fp, pagenos, maxpages=maxpages):
                    interpreter.process_page(page)
            finally:
                # The device must be closed before out_fp, as in the
                # original ordering of device.close() / out_fp.close().
                device.close()
        finally:
            out_fp.close()
    finally:
        in_fp.close()