python - pyPDF 报错：startxref 处的交叉引用（xref）字符错误

Question

我正在使用 pyPDF 进行 pdf 页面提取和合并。我的问题并不完全依赖于 pyPDF，因为我过去在同一个 pdf 文件上遇到过与 pdfSharp 相同类型的错误。

问题是，我在尝试读取从供应商处收到的一些 pdf 文档时遇到了错误。我无法要求他们修复这些文件，所以只能在我们这边处理。目前我在 Java 中使用 iText 来合并 pdf，这些文件没有任何问题，但 iText 比 pyPDF 慢，而且更难维护。pyPDF 中有一段用于读取交叉引用（xref）表的代码。这段代码处理三种情况：该行以“xref”开头；该行以数字开头；或者该行包含“xref”，但在 x 之前多了一个字符。

在我的情况下，该行以“196 0 obj”开头，而下一行是“<< /Length 197 0 R”。pyPDF 和 pdfSharp 都无法识别这种情况，它们试图将其作为交叉引用表读取并抛出异常。对于如何避免这种情况或如何修补 pyPDF，有什么建议吗？这些文件的格式可能确实不规范，但我需要像 Acrobat 和 iText 那样能够处理它们。

这是 pyPDF 库中 pdf.py 的相关部分。代码很长，但关键是以 if x == "x" 开头的那一系列 if 语句：

# read all cross reference tables and their trailers
# (Excerpt from inside a pyPDF reader method; `stream` and `startxref` are
# not defined in this excerpt.)  Each pass of the outer loop looks at the
# single character found at offset `startxref` and dispatches on it:
#   "x"    -> classic "xref" table followed by a "trailer" dictionary
#   digit  -> PDF 1.5+ cross-reference stream object
#   other  -> search the bytes around `startxref` for the text "xref"
# After a table is read, "/Prev" (if present) becomes the next `startxref`;
# otherwise the loop ends.
    # self.xref:        generation -> {object number -> byte offset}
    # self.xref_objStm: object number -> [objstr_num, obstr_idx]
    self.xref = {}
    self.xref_objStm = {}
    self.trailer = DictionaryObject()
    while 1:
        # load the xref table
        stream.seek(startxref, 0)
        x = stream.read(1)
        if x == "x":
            # standard cross-reference table
            # Four more characters are read, but only the first three are
            # checked against "ref".
            ref = stream.read(4)
            if ref[:3] != "ref":
                raise utils.PdfReadError, "xref table read error"
            # NOTE(review): readNonWhitespace is not shown here; paired with
            # seek(-1, 1) it presumably skips whitespace and then steps back
            # onto the first non-whitespace character -- confirm.
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            while 1:
                # Subsection header: first object number, then entry count.
                num = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                size = readObject(stream, self)
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                cnt = 0
                while cnt < size:
                    line = stream.read(20)
                    # It's very clear in section 3.4.3 of the PDF spec
                    # that all cross-reference table lines are a fixed
                    # 20 bytes.  However... some malformed PDF files
                    # use a single character EOL without a preceding
                    # space.  Detect that case, and seek the stream
                    # back one character.  (0-9 means we've bled into
                    # the next xref entry, t means we've bled into the
                    # text "trailer"):
                    if line[-1] in "0123456789t":
                        stream.seek(-1, 1)
                    # The first 16 characters are split on a single space
                    # into the offset and generation fields.
                    offset, generation = line[:16].split(" ")
                    offset, generation = int(offset), int(generation)
                    if not self.xref.has_key(generation):
                        self.xref[generation] = {}
                    if self.xref[generation].has_key(num):
                        # It really seems like we should allow the last
                        # xref table in the file to override previous
                        # ones. Since we read the file backwards, assume
                        # any existing key is already set correctly.
                        pass
                    else:
                        self.xref[generation][num] = offset
                    cnt += 1
                    num += 1
                readNonWhitespace(stream)
                stream.seek(-1, 1)
                # Peek at the next 7 characters: "trailer" ends the table,
                # anything else is rewound and parsed as another subsection.
                trailertag = stream.read(7)
                if trailertag != "trailer":
                    # more xrefs!
                    stream.seek(-7, 1)
                else:
                    break
            readNonWhitespace(stream)
            stream.seek(-1, 1)
            newTrailer = readObject(stream, self)
            # Keys already present in self.trailer are kept; only keys that
            # are still missing are copied from this trailer.
            for key, value in newTrailer.items():
                if not self.trailer.has_key(key):
                    self.trailer[key] = value
            # Continue with the table referenced by /Prev, or stop.
            if newTrailer.has_key("/Prev"):
                startxref = newTrailer["/Prev"]
            else:
                break
        elif x.isdigit():
            # PDF 1.5+ Cross-Reference Stream
            # Step back onto the digit so the object header can be read
            # from its start.
            stream.seek(-1, 1)
            idnum, generation = self.readObjectHeader(stream)
            xrefstream = readObject(stream, self)
            # Any object whose /Type is not /XRef fails this assertion.
            assert xrefstream["/Type"] == "/XRef"
            self.cacheIndirectObject(generation, idnum, xrefstream)
            streamData = StringIO(xrefstream.getData())
            # /Index falls back to a single range [0, /Size] when absent.
            idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
            # /W supplies the number of bytes read for each field of an entry.
            entrySizes = xrefstream.get("/W")
            for num, size in self._pairs(idx_pairs):
                cnt = 0
                while cnt < size:
                    # Field 0 is the entry type; the meaning of fields 1 and
                    # 2 depends on that type (0, 1 or 2).
                    for i in range(len(entrySizes)):
                        d = streamData.read(entrySizes[i])
                        di = convertToInt(d, entrySizes[i])
                        if i == 0:
                            xref_type = di
                        elif i == 1:
                            if xref_type == 0:
                                next_free_object = di
                            elif xref_type == 1:
                                byte_offset = di
                            elif xref_type == 2:
                                objstr_num = di
                        elif i == 2:
                            if xref_type == 0:
                                next_generation = di
                            elif xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    # Type 0 entries are ignored; type 1 entries go into
                    # self.xref and type 2 entries into self.xref_objStm,
                    # in both cases only if the object number is not yet
                    # recorded.
                    if xref_type == 0:
                        pass
                    elif xref_type == 1:
                        if not self.xref.has_key(generation):
                            self.xref[generation] = {}
                        if not num in self.xref[generation]:
                            self.xref[generation][num] = byte_offset
                    elif xref_type == 2:
                        if not num in self.xref_objStm:
                            self.xref_objStm[num] = [objstr_num, obstr_idx]
                    cnt += 1
                    num += 1
            # The stream dictionary also serves as the trailer: copy these
            # keys unless self.trailer already has them.
            trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
            for key in trailerKeys:
                if xrefstream.has_key(key) and not self.trailer.has_key(key):
                    self.trailer[NameObject(key)] = xrefstream.raw_get(key)
            if xrefstream.has_key("/Prev"):
                startxref = xrefstream["/Prev"]
            else:
                break
        else:
            # bad xref character at startxref.  Let's see if we can find
            # the xref table nearby, as we've observed this error with an
            # off-by-one before.
            # One character has already been read, so seeking back 11 puts
            # the stream at startxref - 10; the 20 characters read from
            # there cover startxref - 10 through startxref + 9.
            stream.seek(-11, 1)
            tmp = stream.read(20)
            # Debug output of the inspected window.
            print tmp
            xref_loc = tmp.find("xref")
            if xref_loc != -1:
                # Moves startxref to the position where "xref" was found
                # (startxref - 10 + xref_loc) and retries the outer loop.
                startxref -= (10 - xref_loc)
                continue
            else:
                # no xref table found at specified location
                # With assertions enabled this raises AssertionError, so the
                # `break` below is only reached when they are disabled.
                assert False
                break

注意：我的示例文件是在最后三行处触发 assert False 的。

python - pyPDF 报错：startxref 处的交叉引用（xref）字符错误

0 回答 0

Related

Reference