c# - C# 用 ASCII 字符读取 PDF

Question

我正在使用 iText 库 4.1.6 版本读取 PDF 文件，通常一切正常。现在，当我读取由 PDF 打印驱动程序（通过 MS Word 的打印功能）创建的 PDF 时，我得到了一些 ASCII 字符，却无法正确转换它们。一些 PDF 符号能被正确转换，例如标记 BT（开始文本）、ET（结束文本）等。但是当涉及存储在 PDF 数组中的文本对象时（指 PDF ISO 规范中的数组，而不是我 C# 代码中的数组！），单个字符具有奇怪的值。例如，我有一个“R”，但它的字节值为“1”。在 ASCII 表中，R 是“82”（十进制）。所以我得到的这个字符的值类似于“SOH”。其他库能够以某种方式转换它。有人可以告诉我如何将这个单字节转换为对应的字母“R”吗？我搜索了几个小时，到目前为止没有任何办法奏效。

这是我如何阅读 PDF 文件的最新代码（iText v. 4.1.6）

/// <summary>
/// Reads every page of the given PDF file and concatenates the text produced by
/// <c>ExtractPureTextFromPDFBytes</c> for each page's content stream.
/// </summary>
/// <param name="filename">Path of the PDF file to open.</param>
/// <returns>
/// One entry per page, in page order; each entry is the page's extracted text
/// followed by a single space and a line terminator.
/// </returns>
public string ExtractPureText(string filename)
    {
        StringBuilder sb = new StringBuilder();

        // Create a reader for the given PDF file
        PdfReader reader = new PdfReader(filename);
        try
        {
            // PDF page numbers are 1-based.
            int pageCount = reader.NumberOfPages;
            for (int page = 1; page <= pageCount; page++)
            {
                sb.AppendLine(ExtractPureTextFromPDFBytes(reader.GetPageContent(page), page) + " ");
            }
        }
        finally
        {
            // Release the underlying file even when extraction of a page throws.
            reader.Close();
        }

        return sb.ToString();
    }

这是 ExtractPureTextFromPDFBytes 函数

/// <summary>
/// Scans the raw bytes of a page content stream, echoing every byte as a character
/// and inserting line breaks around text objects (BT ... ET) and after
/// text-positioning / text-showing operators. Empty lines are removed and the
/// remaining lines are trimmed before the result is returned.
/// </summary>
/// <remarks>
/// Each byte is cast directly to a <c>char</c>; no font encoding is applied.
/// NOTE(review): string operands in a content stream hold character codes of the
/// current font, so for fonts with a custom encoding the codes presumably will not
/// match ASCII (e.g. a code of 1 for "R") - confirm against the font's
/// Encoding / ToUnicode entries.
/// </remarks>
/// <param name="input">Raw content-stream bytes of one page; may be null or empty.</param>
/// <param name="pageNumber">Page number written into the "&lt;n&gt; PN" marker line after each BT.</param>
/// <returns>
/// The cleaned-up text, one non-empty trimmed line per row, or an empty string when
/// <paramref name="input"/> is null/empty or any exception occurs while scanning.
/// </returns>
public string ExtractPureTextFromPDFBytes(byte[] input, int pageNumber)
    {
        if (input == null || input.Length == 0) return "";

        // Operator tokens, allocated once instead of on every loop iteration.
        string[] lineBreakTokens = { "TD", "Td", "'", "T*", "\"", "TJ", "Tj", "Tf" };
        string[] endTextTokens = { "ET" };
        string[] beginTextTokens = { "BT" };

        try
        {
            StringBuilder result = new StringBuilder(input.Length);

            // Flag showing if we are currently inside a text object
            bool inTextObject = false;

            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;

            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;

            // Sliding window of the most recent characters, used by CheckToken
            // to recognise operators that have just been read.
            char[] previousCharacters = new char[_numberOfCharsToKeep];
            for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';

            for (int readPosition = 0; readPosition < input.Length; readPosition++)
            {
                // Byte 213 is replaced by an apostrophe
                // (presumably a typographic quote in the source encoding - unverified).
                char c = input[readPosition] == 213 ? '\'' : (char)input[readPosition];

                if (inTextObject)
                {
                    // A positioning/showing operator outside of a string starts a new line.
                    if (bracketDepth == 0 && CheckToken(lineBreakTokens, previousCharacters))
                    {
                        result.Append(System.Environment.NewLine);
                    }

                    if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                    {
                        // End of a text object, also go to a new line.
                        result.Append(System.Environment.NewLine);
                        inTextObject = false;
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Opening bracket of a string operand
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Closing bracket of a string operand
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        // Inside a string: an unescaped backslash marks the next
                        // character as literal; any other character clears the mark.
                        nextLiteral = (c == '\\' && !nextLiteral);
                    }
                }

                // Every character is echoed to the output, inside or outside a string.
                result.Append(c);

                // Store the recent characters for
                // when we have to go back for a checking
                for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                {
                    previousCharacters[j] = previousCharacters[j + 1];
                }
                previousCharacters[_numberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                    result.Append(System.Environment.NewLine);
                    result.Append(pageNumber.ToString() + " PN" + System.Environment.NewLine);
                }
            }

            StringBuilder output = new StringBuilder();

            // clean up text, remove empty lines and trim lines
            using (StringReader reader = new StringReader(result.ToString()))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    line = line.Trim();
                    if (line != string.Empty)
                    {
                        output.Append(line).Append(System.Environment.NewLine);
                    }
                }
            }

            return output.ToString();
        }
        catch
        {
            // Best effort: callers receive an empty string when scanning fails.
            return "";
        }
    }

由于许可（licensing）问题，我完全无法使用更高版本的 iText。只有当某个库只需要开发人员许可证时才可行；而 iText 要求为每台将安装我的软件的机器支付许可费用。不幸的是，这对我来说不可行。谢谢你的帮助。

c# - C# 用 ASCII 字符读取 PDF

0 回答 0

Related

Reference