我想直接从网站解析这个 PDF 文件,而不必先把它下载到本地。我已经用硬盘上的同一个文件运行过解析代码,完全没有问题;但运行下面这个脚本时,它会在这一处出错:
# Abort when the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
我想是我把网址(URL)接入代码的方式弄错了。
import sys
import getopt
import urllib2
import datetime
import re
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFConverter, LTContainer, LTText, LTTextBox, LTImage
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from urllib2 import Request
# Define a PDF parser function
def parsePDF(url):
    """Download the PDF at *url* and return its extracted text.

    The PDF is fetched over the network and parsed entirely in memory;
    nothing is written to disk.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The text of every page, as produced by pdfminer's ``TextConverter``
        with the ``utf-8`` codec (the raw contents of the output buffer).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    from contextlib import closing
    from io import BytesIO

    # Imported here because the file-level imports do not provide these names.
    # NOTE(review): these modules belong to the newer pdfminer API; confirm
    # the installed version still offers everything the file-level imports
    # ask for (e.g. ``process_pdf``).
    from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
    from pdfminer.pdfpage import PDFPage

    # Open the url provided as an argument to the function and read the
    # content; ``closing`` guarantees the connection is released.
    with closing(urllib2.urlopen(Request(url))) as response:
        pdf_bytes = response.read()

    # Wrap the downloaded bytes in an in-memory binary file: an HTTP
    # response cannot be rewound, whereas a BytesIO object is seekable.
    memory_file = BytesIO(pdf_bytes)

    # Create a PDF parser object associated with the in-memory file
    parser = PDFParser(memory_file)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Define parameters to the PDF device object
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    codec = 'utf-8'

    # Create a PDF device object. TextConverter is the device that writes
    # extracted text to an output stream; the base PDFDevice takes no
    # stream, codec or layout parameters.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
    finally:
        device.close()

    return retstr.getvalue()
# Construct the url of the PDF to parse (host + path to the document)
url = (
    'http://www.city.pittsburgh.pa.us'
    '/police/blotter/blotter_monday.pdf'
)