我正在使用 pyPDF 进行 PDF 页面的提取和合并。我的问题并不只针对 pyPDF,因为过去我用 pdfSharp 处理同一批 PDF 文件时也遇到过同类错误。
问题是,我在尝试读取供应商发来的某些 PDF 文档时会遇到错误。我无法要求他们修复这些文件,所以只能在我们这边处理。目前我在 Java 中使用 iText 来合并 PDF,它处理这些文件没有任何问题,但 iText 比 pyPDF 慢,而且更难维护。pyPDF 中有一段用于读取交叉引用表(xref table)的代码。这段代码处理了几种情况:该行以“xref”开头;该行以数字开头;或者该行包含“xref”,但在 x 之前多了一个字符。
在我的情况下,该行以“196 0 obj”开头,而下一行是“<< /Length 197 0 R”。pyPDF 和 pdfSharp 都无法识别这种情况,它们试图把它当作交叉引用来读取,结果抛出异常。对于如何避免这种情况,或者如何给 pyPDF 打补丁,大家有什么建议吗?这些文件的格式可能不规范,但我需要像 Acrobat 和 iText 那样能够容错地处理它们。
下面是 pyPDF 库中 pdf.py 的相关部分。代码比较长,但关键是以 if x == "x" 开头的那一系列 if 语句:
# read all cross reference tables and their trailers
#
# Starting at the byte offset held in `startxref`, each pass of the outer
# loop parses one cross-reference section and then follows its "/Prev"
# entry to the next section, stopping when a section has no "/Prev".
# Results accumulate in:
#   self.xref[generation][num] -> byte offset of object `num`
#   self.xref_objStm[num]      -> [objstr_num, obstr_idx]
#   self.trailer               -> first value seen for each trailer key
# Existing entries are never overwritten, so whichever section is read
# first wins.
self.xref = {}
self.xref_objStm = {}
self.trailer = DictionaryObject()
while 1:
    # load the xref table
    stream.seek(startxref, 0)
    x = stream.read(1)
    if x == "x":
        # standard cross-reference table
        ref = stream.read(4)
        if ref[:3] != "ref":
            raise utils.PdfReadError("xref table read error")
        # Skip whitespace, then step back one byte so the next read starts
        # on the first non-whitespace character.
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        while 1:
            # Each subsection starts with two objects: the first object
            # number and the number of entries that follow.
            num = readObject(stream, self)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            size = readObject(stream, self)
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            cnt = 0
            while cnt < size:
                line = stream.read(20)
                # It's very clear in section 3.4.3 of the PDF spec
                # that all cross-reference table lines are a fixed
                # 20 bytes. However... some malformed PDF files
                # use a single character EOL without a preceding
                # space. Detect that case, and seek the stream
                # back one character. (0-9 means we've bled into
                # the next xref entry, t means we've bled into the
                # text "trailer"):
                if line[-1] in "0123456789t":
                    stream.seek(-1, 1)
                # Only the first 16 bytes are parsed: "<offset> <generation>".
                offset, generation = line[:16].split(" ")
                offset, generation = int(offset), int(generation)
                if not self.xref.has_key(generation):
                    self.xref[generation] = {}
                if self.xref[generation].has_key(num):
                    # It really seems like we should allow the last
                    # xref table in the file to override previous
                    # ones. Since we read the file backwards, assume
                    # any existing key is already set correctly.
                    pass
                else:
                    self.xref[generation][num] = offset
                cnt += 1
                num += 1
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            trailertag = stream.read(7)
            if trailertag != "trailer":
                # more xrefs!
                # Rewind the 7 bytes just read and parse another subsection.
                stream.seek(-7, 1)
            else:
                break
        readNonWhitespace(stream)
        stream.seek(-1, 1)
        newTrailer = readObject(stream, self)
        # Merge the trailer, keeping any value recorded by an earlier pass.
        for key, value in newTrailer.items():
            if not self.trailer.has_key(key):
                self.trailer[key] = value
        if newTrailer.has_key("/Prev"):
            startxref = newTrailer["/Prev"]
        else:
            break
    elif x.isdigit():
        # PDF 1.5+ Cross-Reference Stream
        # Un-read the digit so the object header is parsed from its start.
        stream.seek(-1, 1)
        idnum, generation = self.readObjectHeader(stream)
        xrefstream = readObject(stream, self)
        # NOTE(review): any indirect object whose header begins with a digit
        # reaches this branch; if it is not an /XRef stream this assertion
        # is what fails.
        assert xrefstream["/Type"] == "/XRef"
        self.cacheIndirectObject(generation, idnum, xrefstream)
        streamData = StringIO(xrefstream.getData())
        # Without "/Index" a single range [0, /Size] is used.
        idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
        # "/W" supplies the byte width of each field of an entry.
        entrySizes = xrefstream.get("/W")
        for num, size in self._pairs(idx_pairs):
            cnt = 0
            while cnt < size:
                # Decode one entry field by field: field 0 is the entry
                # type; the meaning of fields 1 and 2 depends on that type.
                for i in range(len(entrySizes)):
                    d = streamData.read(entrySizes[i])
                    di = convertToInt(d, entrySizes[i])
                    if i == 0:
                        xref_type = di
                    elif i == 1:
                        if xref_type == 0:
                            next_free_object = di
                        elif xref_type == 1:
                            byte_offset = di
                        elif xref_type == 2:
                            objstr_num = di
                    elif i == 2:
                        if xref_type == 0:
                            next_generation = di
                        elif xref_type == 1:
                            generation = di
                        elif xref_type == 2:
                            obstr_idx = di
                if xref_type == 0:
                    # Type 0 entries are decoded above but not recorded.
                    pass
                elif xref_type == 1:
                    # Type 1: record the object's byte offset.
                    if not self.xref.has_key(generation):
                        self.xref[generation] = {}
                    if not num in self.xref[generation]:
                        self.xref[generation][num] = byte_offset
                elif xref_type == 2:
                    # Type 2: record the object-stream number and the index
                    # within it.
                    if not num in self.xref_objStm:
                        self.xref_objStm[num] = [objstr_num, obstr_idx]
                cnt += 1
                num += 1
        # The stream dictionary doubles as the trailer; copy over only
        # these keys, and only if not already recorded.
        trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
        for key in trailerKeys:
            if xrefstream.has_key(key) and not self.trailer.has_key(key):
                self.trailer[NameObject(key)] = xrefstream.raw_get(key)
        if xrefstream.has_key("/Prev"):
            startxref = xrefstream["/Prev"]
        else:
            break
    else:
        # bad xref character at startxref. Let's see if we can find
        # the xref table nearby, as we've observed this error with an
        # off-by-one before.
        # One byte has been read, so seeking back 11 lands at
        # startxref - 10; the 20-byte window read from there covers
        # startxref - 10 .. startxref + 9.
        stream.seek(-11, 1)
        tmp = stream.read(20)
        # Diagnostic output of the window being searched.
        print(tmp)
        xref_loc = tmp.find("xref")
        if xref_loc != -1:
            # "xref" sits at (startxref - 10) + xref_loc; retry from there.
            startxref -= (10 - xref_loc)
            continue
        else:
            # no xref table found at specified location
            # NOTE(review): assert statements are removed under `python -O`;
            # in that case the `break` below ends the loop instead of raising.
            assert False
            break
注意:我的示例文件是在上面代码的最后三行处触发了 assert False。