我正在尝试使用 PDFMiner 解析目录中的 PDF 文件,我从复制此处包含的文档中的第一个脚本开始。代码(在下面重复)打开文件,并创建解析器对象,但在尝试创建文档对象时给出“意外 EOF”错误。任何有助于理解为什么会这样的帮助将不胜感激。具体来说,是否存在无法以这种方式解析的 PDF 类型?
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
import os
import re
fp = open(os.getcwd() + '\\' + os.listdir(os.getcwd())[0])
parser = PDFParser(fp)
doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed
rsrcmgr = PDFResourceManager()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
这会产生以下错误:
%run scrape_psr.py
---------------------------------------------------------------------------
PSEOF Traceback (most recent call last)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
195 else:
196 filename = fname
--> 197 exec compile(scripttext, filename, 'exec') in glob, loc
198 else:
199 def execfile(fname, *where):
22
23 parser = PDFParser(fp)
---> 24 doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error
25
26 if not doc.is_extractable:
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in __init__(self, parser, password, caching, fallback)
313 parser.fallback = True
314 xref = PDFXRefFallback()
--> 315 xref.load(parser)
316 self.xrefs.append(xref)
317 for xref in self.xrefs:
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in load(self, parser, debug)
173 # expand ObjStm.
174 parser.seek(pos)
--> 175 (_, obj) = parser.nextobject()
176 if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
177 stream = stream_value(obj)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nextobject(self)
555 """
556 while not self.results:
--> 557 (pos, token) = self.nexttoken()
558 #print (pos,token), (self.curtype, self.curstack)
559 if isinstance(token, (int, long, float, bool, str, PSLiteral)):
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nexttoken(self)
480 def nexttoken(self):
481 while not self._tokens:
--> 482 self.fillbuf()
483 self.charpos = self._parse1(self.buf, self.charpos)
484 token = self._tokens.pop(0)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in fillbuf(self)
213 self.buf = self.fp.read(self.BUFSIZ)
214 if not self.buf:
--> 215 raise PSEOF('Unexpected EOF')
216 self.charpos = 0
217 return
PSEOF: Unexpected EOF
编辑:有人建议问题出在外部参照表上,与此处的问题有关。我不相信,因为该错误跟踪没有提到“意外的 EOF”,而是“找不到 EOF 标记”。这样的解决方案可能是有效的,因为它本质上是说目前存在的 Python 包不太适合解析 PDF 文件标准的混乱。