1

我正在尝试使用 PDFMiner 解析目录中的 PDF 文件,我从复制此处包含的文档中的第一个脚本开始。代码(在下面重复)打开文件,并创建解析器对象,但在尝试创建文档对象时给出“意外 EOF”错误。任何有助于理解为什么会这样的帮助将不胜感激。具体来说,是否存在无法以这种方式解析的 PDF 类型?

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

import os
import re

fp = open(os.getcwd() + '\\' + os.listdir(os.getcwd())[0])

parser = PDFParser(fp)
doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error

if not doc.is_extractable:
    raise PDFTextExtractionNotAllowed

rsrcmgr = PDFResourceManager()
device = PDFDevice(rsrcmgr)
interpreter = PDFPageInterpreter(rsrcmgr, device)

for page in PDFPage.create_pages(doc):
    interpreter.process_page(page)

这会产生以下错误:

%run scrape_psr.py
---------------------------------------------------------------------------
PSEOF                                     Traceback (most recent call last)
C:\Users\Rob Lantz\Anaconda\lib\site-packages\IPython\utils\py3compat.pyc in execfile(fname, glob, loc)
    195             else:
    196                 filename = fname
--> 197             exec compile(scripttext, filename, 'exec') in glob, loc
    198     else:
    199         def execfile(fname, *where):


     22 
     23 parser = PDFParser(fp)
---> 24 doc = PDFDocument(parser) #This is the problem, getting an "unexpected EOF" error
     25 
     26 if not doc.is_extractable:

C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in __init__(self, parser, password, caching, fallback)
    313             parser.fallback = True
    314             xref = PDFXRefFallback()
--> 315             xref.load(parser)
    316             self.xrefs.append(xref)
    317         for xref in self.xrefs:

C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\pdfdocument.pyc in load(self, parser, debug)
    173             # expand ObjStm.
    174             parser.seek(pos)
--> 175             (_, obj) = parser.nextobject()
    176             if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
    177                 stream = stream_value(obj)

C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nextobject(self)
    555         """
    556         while not self.results:
--> 557             (pos, token) = self.nexttoken()
    558             #print (pos,token), (self.curtype, self.curstack)
    559             if isinstance(token, (int, long, float, bool, str, PSLiteral)):

C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in nexttoken(self)
    480     def nexttoken(self):
    481         while not self._tokens:
--> 482             self.fillbuf()
    483             self.charpos = self._parse1(self.buf, self.charpos)
    484         token = self._tokens.pop(0)

C:\Users\Rob Lantz\Anaconda\lib\site-packages\pdfminer-20140328-py2.7.egg\pdfminer\psparser.pyc in fillbuf(self)
    213         self.buf = self.fp.read(self.BUFSIZ)
    214         if not self.buf:
--> 215             raise PSEOF('Unexpected EOF')
    216         self.charpos = 0
    217         return

PSEOF: Unexpected EOF

编辑:有人建议问题出在外部参照表上,与此处的问题有关。我不相信,因为该错误跟踪没有提到“意外的 EOF”,而是“找不到 EOF 标记”。这样的解决方案可能是有效的,因为它本质上是说目前存在的 Python 包不太适合解析 PDF 文件标准的混乱。

4

0 回答 0