这是这个问题的后续
我们想出了如何使用下面的代码提取标记为编辑的文本。
但是,除了标记为编辑的文本之外,还会捕获其他前导和尾随字符。
例如,如果在源 PDF 文档中将句子标记为编辑,则下面的代码还会捕获前一个句子的最后几个字符和下一个句子的前几个字符。
有人能在下面的代码中看到问题吗?
for (int i = 1; i <= rdr.NumberOfPages; i++)
{
// Get pages and corresponding annotations
PdfDictionary dict = rdr.GetPageN(i);
PdfArray annots = dict.GetAsArray(PdfName.ANNOTS);
foreach (var annItem in annots.ArrayList)
{
PdfDictionary d = PdfReader.GetPdfObject(annItem) as PdfDictionary;
PdfName typ = d.GetAsName(PdfName.SUBTYPE);
if (typ.ToString().StartsWith("/Redact"))
{
sb = new StringBuilder();
PdfObject o1 = d.Get(PdfName.QUADPOINTS);
PdfArray arr2 = o1 as PdfArray;
int numLines = arr2.ArrayList.Count / 8;
for (int k = 0; k < numLines; k++)
{
llx = float.Parse(arr2[(0 + k * 8)].ToString());
lly = float.Parse(arr2[(1 + k * 8)].ToString());
urx = float.Parse(arr2[(6 + k * 8)].ToString());
ury = float.Parse(arr2[(7 + k * 8)].ToString());
Rectangle rect = new Rectangle(llx, lly, urx, ury, 1);
List<RenderFilter> filters = new List<RenderFilter>();
filters.Add(new RegionTextRenderFilter(rect));
strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(),
filters.ToArray<RenderFilter>());
// We get the text but get extra leading/trailing chars that are not in redaction.
sb.Append(PdfTextExtractor.GetTextFromPage(rdr, i, strategy));
}
Console.WriteLine("Page: " + i.ToString());
Console.WriteLine(sb.ToString() + Environment.NewLine);
sb.Clear();
}
}
}