我正在尝试从 PDF 中提取文本以便进行分析，但在对页面调用 extractText() 提取文本时，收到了以下错误。
Traceback (most recent call last):
File "C:\Program Files (x86)\eclipse\plugins\org.python.pydev_2.7.4.2013051601\pysrc\pydevd_comm.py", line 765, in doIt
result = pydevd_vars.evaluateExpression(self.thread_id, self.frame_id, self.expression, self.doExec)
File "C:\Program Files (x86)\eclipse\plugins\org.python.pydev_2.7.4.2013051601\pysrc\pydevd_vars.py", line 376, in evaluateExpression
result = eval(compiled, updated_globals, frame.f_locals)
File "<string>", line 1, in <module>
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\pdf.py", line 1701, in extractText
content = ContentStream(content, self.pdf)
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\pdf.py", line 1783, in __init__
stream = StringIO(stream.getData())
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\generic.py", line 801, in getData
decoded._data = filters.decodeStreamData(self)
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\filters.py", line 228, in decodeStreamData
data = ASCII85Decode.decode(data)
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\filters.py", line 170, in decode
data = [y for y in data if not (y in ' \n\r\t')]
File "C:\Python33\lib\site-packages\pypdf2-1.9.0-py3.3.egg\PyPDF2\filters.py", line 170, in <listcomp>
data = [y for y in data if not (y in ' \n\r\t')]
TypeError: 'in <string>' requires string as left operand, not int
相关代码部分如下:
from PyPDF2 import PdfFileReader
# Extract the text of every page of every PDF listed in self.PDF_List and
# hand it, one line at a time, to self.Analyse_Line.
for PDF_Entry in self.PDF_List:
    # Open the file in a context manager so the handle is closed once the
    # PDF has been processed, even if extraction raises; the original left
    # one file open per entry.
    with open(PDF_Entry, "rb") as pdf_stream:
        Pdf_File = PdfFileReader(pdf_stream)
        # All page access stays inside the `with` block because the reader
        # pulls page data from the still-open stream.
        for pg_idx in range(Pdf_File.getNumPages()):
            # NOTE(review): per the traceback, this call fails inside
            # PyPDF2 1.9.0's ASCII85Decode (filters.py line 170), which
            # tests each element of the stream data against a str; the
            # TypeError shows the elements are ints. The failure is in the
            # library, not in this loop -- presumably a newer PyPDF2
            # release handles this; verify before relying on it.
            page_Content = Pdf_File.getPage(pg_idx).extractText()
            for line in page_Content.split("\n"):
                self.Analyse_Line(line)
错误是在调用 extractText() 的那一行抛出的。