我正在使用 iText 库 4.1.6 版读取 PDF 文件,通常一切正常。现在,当我读取由 PDF 打印驱动程序(通过 MS Word 的打印功能)创建的 PDF 时,得到了一些无法正确转换的 ASCII 字符。一些 PDF 运算符可以正确转换,例如 BT(开始文本)、ET(结束文本)等。但是,对于存储在 PDF 数组(指 PDF ISO 规范中的数组,而不是我 C# 代码中的数组!)中的文本对象,单个字符的值很奇怪。例如,本应是字母“R”,但它的字节值却是“1”;而在 ASCII 表中,“R”是十进制的 82。因此我得到的是类似“SOH”的控制字符。其他库却能以某种方式正确转换它。有人能告诉我如何把这个单字节转换成对应的字母“R”吗?我已经搜索了好几个小时,到目前为止没有任何方法奏效。
这是我目前用于读取 PDF 文件的最新代码(iText v. 4.1.6):
/// <summary>
/// Opens the PDF at <paramref name="filename"/>, passes each page's raw content
/// stream to <c>ExtractPureTextFromPDFBytes</c>, and concatenates the results.
/// </summary>
/// <param name="filename">Path of the PDF file handed to the <c>PdfReader</c> constructor.</param>
/// <returns>
/// The per-page results in page order; each page's result is followed by a
/// single space and a line terminator.
/// </returns>
public string ExtractPureText(string filename)
{
    StringBuilder sb = new StringBuilder();
    // Create a reader for the given PDF file
    PdfReader reader = new PdfReader(filename);
    try
    {
        int pageCount = reader.NumberOfPages;
        // PDF page numbers are 1-based.
        for (int page = 1; page <= pageCount; page++)
        {
            sb.AppendLine(ExtractPureTextFromPDFBytes(reader.GetPageContent(page), page) + " ");
        }
    }
    finally
    {
        // Close the reader even when a page fails, so whatever it holds open is released.
        reader.Close();
    }
    return sb.ToString();
}
这是 ExtractPureTextFromPDFBytes 函数
/// <summary>
/// Scans a raw PDF content stream byte by byte, tracks BT/ET text objects and
/// parenthesised string literals, and returns the stream's characters split
/// into trimmed, non-empty lines.
/// </summary>
/// <remarks>
/// Every byte of the stream is cast to a <c>char</c> and written to the result
/// (operators and operands included, not only the text inside parentheses).
/// Line breaks are inserted after text-positioning/showing operators, at the
/// end of a text object, and at the start of a text object, where a
/// "&lt;pageNumber&gt; PN" marker line is also emitted.
/// NOTE(review): the bytes inside ( ) are character codes of the currently
/// selected font, not necessarily ASCII. PDFs produced by print drivers often
/// use subset fonts with custom codes (e.g. code 1 for 'R'); mapping those
/// back to letters needs the font's /ToUnicode CMap or /Encoding, which this
/// method does not consult — confirm against the PDF specification.
/// </remarks>
/// <param name="input">Raw content-stream bytes of one page; may be null or empty.</param>
/// <param name="pageNumber">Page number written into the "PN" marker line.</param>
/// <returns>
/// The cleaned-up text, or an empty string when <paramref name="input"/> is
/// null/empty or when any exception occurs while scanning.
/// </returns>
public string ExtractPureTextFromPDFBytes(byte[] input, int pageNumber)
{
    if (input == null || input.Length == 0) return "";
    try
    {
        StringBuilder resultString = new StringBuilder(input.Length);
        // Flag showing if we are currently inside a text object (between BT and ET)
        bool inTextObject = false;
        // Flag showing if the next character is literal
        // e.g. '\\' to get a '\' character or '\(' to get '('
        bool nextLiteral = false;
        // () Bracket nesting level. Text appears inside ()
        int bracketDepth = 0;
        // Sliding window of the most recent characters, used to recognise operators
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        // Operator sets are loop-invariant, so build them once instead of per byte.
        string[] lineBreakTokens = new string[] { "TD", "Td", "'", "T*", "\"", "TJ", "Tj", "Tf" };
        string[] endTextTokens = new string[] { "ET" };
        string[] beginTextTokens = new string[] { "BT" };

        for (int readPosition = 0; readPosition < input.Length; readPosition++)
        {
            char c = (char)input[readPosition];
            // Byte 213 is replaced by an apostrophe before any further processing.
            if (input[readPosition] == 213)
            {
                c = '\'';
            }

            if (inTextObject)
            {
                // Outside a string literal, a positioning/showing operator starts a new line.
                if (bracketDepth == 0 && CheckToken(lineBreakTokens, previousCharacters))
                {
                    resultString.Append(System.Environment.NewLine);
                }

                if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                {
                    // End of a text object, also go to a new line.
                    resultString.Append(System.Environment.NewLine);
                    inTextObject = false;
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start of a string literal
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // End of a string literal
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    // Inside a literal: an unescaped backslash escapes the next
                    // character; any other character clears the escape state.
                    nextLiteral = (c == '\\' && !nextLiteral);
                }
            }

            // Every character of the stream is emitted, whether or not it is text.
            resultString.Append(c);

            // Store the recent characters for
            // when we have to go back for a checking
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
            {
                inTextObject = true;
                resultString.Append(System.Environment.NewLine);
                resultString.Append(pageNumber.ToString() + " PN" + System.Environment.NewLine);
            }
        }

        // clean up text, remove empty lines and trim lines
        StringBuilder output = new StringBuilder(resultString.Length);
        using (StringReader reader = new StringReader(resultString.ToString()))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                line = line.Trim();
                if (line != string.Empty)
                {
                    output.Append(line).Append(System.Environment.NewLine);
                }
            }
        }
        return output.ToString();
    }
    catch (System.Exception)
    {
        // Deliberate best-effort: callers receive an empty string for a page that cannot be scanned.
        return "";
    }
}
由于许可(licensing)方面的原因,我完全无法改用更高版本的 iText。要是有只需开发人员许可证的库就好了;而 iText 必须为每台安装我的软件的机器支付许可费用。遗憾的是,这对我来说不可行。谢谢你的帮助。