2

我正在尝试使用 iTextSharp 从 PDF 文件中提取图像。

该过程适用于我拥有的大多数 PDF 文件,但其他一些文件失败。

特别是,我观察到提取失败的 PDF 中的图像使用了过滤器 [/ASCIIHexDecode /CCITTFaxDecode]。

如何使用此过滤器解码图像?

仅供参考,我的图像提取例程如下(pdfPage 对象是通过 PdfReader.GetPageN 获得的):

/// <summary>
/// Decodes every image XObject reachable from the given page dictionary into a bitmap.
/// </summary>
/// <param name="reader">Reader of the document that owns <paramref name="pdfPage"/>.</param>
/// <param name="pdfPage">Page dictionary whose resources are searched for images.</param>
private static void FindImages(PdfReader reader, PdfDictionary pdfPage)
{
    foreach (var image in FindImageInPDFDictionary(pdfPage))
    {
        // FindImageInPDFDictionary only yields indirect references, so the cast is safe.
        var xrefIndex = ((PRIndirectReference)image).Number;
        var stream = reader.GetPdfObject(xrefIndex);
        // Exception occurs here :
        var pdfImage = new PdfImageObject((PRStream)stream);

        // The drawing image wraps GDI+ resources; dispose it once it has been consumed.
        using (var img = (Bitmap)pdfImage.GetDrawingImage())
        {
            // Do something with the image
        }
    }
}
/// <summary>
/// Lazily enumerates the indirect references of all image XObjects found in the
/// /Resources of the given dictionary, descending recursively into XObjects whose
/// subtype is Form or Group.
/// </summary>
/// <param name="pg">A page (or nested XObject) dictionary to search.</param>
/// <returns>
/// The indirect references (not the resolved streams) of every image XObject found;
/// empty when the dictionary has no resources or no XObjects.
/// </returns>
private static IEnumerable<PdfObject> FindImageInPDFDictionary(PdfDictionary pg)
{
    if (pg == null)
        yield break;

    // /Resources may be absent; without this guard the next lookup would throw
    // a NullReferenceException.
    PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
    if (res == null)
        yield break;

    PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
    if (xobj == null)
        yield break;

    foreach (PdfName name in xobj.Keys)
    {
        PdfObject obj = xobj.Get(name);
        // Only indirect objects are considered.
        if (obj == null || !obj.IsIndirect())
            continue;

        // Skip references that do not resolve to a dictionary instead of failing on a cast.
        PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
        if (tg == null)
            continue;

        PdfName type = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;

        if (PdfName.IMAGE.Equals(type))
        {
            // Image directly in this resource dictionary.
            yield return obj;
        }
        else if (PdfName.FORM.Equals(type) || PdfName.GROUP.Equals(type))
        {
            // Image nested inside a form or a group: search its own resources.
            foreach (var nestedObj in FindImageInPDFDictionary(tg))
            {
                yield return nestedObj;
            }
        }
    }
}

确切的异常信息是:

iTextSharp.text.exceptions.InvalidImageException: **Invalid code encountered while decoding 2D group 4 compressed data.**
  à iTextSharp.text.pdf.codec.TIFFFaxDecoder.DecodeT6(Byte[] buffer, Byte[] compData, Int32 startX, Int32 height, Int64 tiffT6Options)
  à iTextSharp.text.pdf.FilterHandlers.Filter_CCITTFAXDECODE.Decode(Byte[] b, PdfName filterName, PdfObject decodeParams, PdfDictionary streamDictionary)
  à iTextSharp.text.pdf.PdfReader.DecodeBytes(Byte[] b, PdfDictionary streamDictionary, IDictionary`2 filterHandlers)
  à iTextSharp.text.pdf.parser.PdfImageObject..ctor(PdfDictionary dictionary, Byte[] samples, PdfDictionary colorSpaceDic)
  à iTextSharp.text.pdf.parser.PdfImageObject..ctor(PRStream stream)
  à MyProject.MyClass.MyMethod(PdfReader reader, PdfDictionary pdfPage) dans c:\\sopmewhere\\PdfProcessor.cs:ligne 161

仅供参考:这是一个导致麻烦的示例 PDF:test.pdf

4

1 回答 1

0

在不深入研究您的代码示例的情况下,我可以指出 PDF 过滤器还有其他的替代实现,其中一个非常简单的实现是下面的 PDFSharp-AsciiHexDecode.cs。希望它会有所帮助:用它替换 iTextSharp 自带的编码器和解码器应该很简单,这样就可以验证究竟是数据本身已损坏,还是某个解码器/编码器存在错误。不幸的是,在撰写本文时,我手头还没有 /CCITTFaxDecode 的示例。

#region PDFsharp - A .NET library for processing PDF
//
// Copyright (c) 2005-2016 empira Software GmbH, Cologne Area (Germany)
//
// http://www.pdfsharp.com
// http://sourceforge.net/projects/pdfsharp
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
// DEALINGS IN THE SOFTWARE.
#endregion

using System;

namespace PdfSharp.Pdf.Filters
{
    /// <summary>
    /// Implements the ASCIIHexDecode filter.
    /// </summary>
    public class AsciiHexDecode : Filter
    {
        // Reference: 3.3.1  ASCIIHexDecode Filter / Page 69

        /// <summary>
        /// Encodes the specified data as upper-case hexadecimal digits, two per input byte.
        /// No EOD marker is appended.
        /// </summary>
        /// <param name="data">The raw bytes to encode.</param>
        /// <returns>An array of ASCII hex digits, twice as long as <paramref name="data"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public override byte[] Encode(byte[] data)
        {
            if (data == null)
                throw new ArgumentNullException("data");

            int count = data.Length;
            byte[] bytes = new byte[2 * count];
            for (int i = 0, j = 0; i < count; i++)
            {
                byte b = data[i];
                bytes[j++] = HexDigit(b >> 4);
                bytes[j++] = HexDigit(b & 0xF);
            }
            return bytes;
        }

        /// <summary>
        /// Decodes the specified hexadecimal data.
        /// White space is ignored, decoding stops at the first EOD marker ('&gt;'), and an
        /// odd number of digits is completed with a trailing '0' digit.
        /// </summary>
        /// <param name="data">The ASCII hex encoded bytes.</param>
        /// <param name="parms">Filter parameters; not used by this filter.</param>
        /// <returns>The decoded bytes.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="data"/> contains a character other than 0-9, A-F, a-f before the EOD marker.
        /// </exception>
        public override byte[] Decode(byte[] data, FilterParms parms)
        {
            if (data == null)
                throw new ArgumentNullException("data");

            data = RemoveWhiteSpace(data);

            // Everything from the EOD marker onwards is not part of the encoded data.
            int digits = Array.IndexOf(data, (byte)'>');
            if (digits < 0)
                digits = data.Length;

            byte[] bytes = new byte[(digits + 1) / 2];
            for (int i = 0, j = 0; i < bytes.Length; i++)
            {
                int hi = HexValue(data[j++]);
                // An odd number of digits behaves as if a '0' followed the last digit.
                int lo = j < digits ? HexValue(data[j++]) : 0;
                bytes[i] = (byte)((hi << 4) | lo);
            }
            return bytes;
        }

        /// <summary>
        /// Converts a value in the range 0..15 to its upper-case ASCII hex digit.
        /// </summary>
        private static byte HexDigit(int nibble)
        {
            return (byte)(nibble + (nibble < 10 ? '0' : 'A' - 10));
        }

        /// <summary>
        /// Converts an ASCII hex digit (0-9, A-F, a-f) to its numeric value; throws on any other character.
        /// </summary>
        private static int HexValue(byte digit)
        {
            if (digit >= '0' && digit <= '9')
                return digit - '0';
            if (digit >= 'A' && digit <= 'F')
                return digit - 'A' + 10;
            if (digit >= 'a' && digit <= 'f')
                return digit - 'a' + 10;
            // "Any other characters cause an error."
            throw new ArgumentException("Invalid character 0x" + digit.ToString("X2") + " in ASCIIHexDecode data.", "data");
        }
    }
}
于 2017-11-03T22:50:40.033 回答