1

我需要从 PDF 文档中删除文本。为此我使用的是 Aspose，目前用的是 TextFragmentAbsorber。

仅供参考：我不能使用任何其他第三方库。

下面是我正在使用的代码:

/// <summary>
/// Blanks out every text run matching <c>#START#...#END#</c> in a PDF and saves the
/// result as a new, randomly named file in the temp directory.
/// </summary>
/// <param name="inputFilePath">
/// Path of the PDF to process. The file is deleted afterwards, whether or not
/// processing succeeded.
/// </param>
/// <returns>Path of the newly written PDF.</returns>
/// <remarks>
/// The search is run through <c>Pages.Accept</c>; a block that spans a page break is
/// reportedly not matched by this regular expression.
/// </remarks>
private string DeleteMachineReadableCode(string inputFilePath)
{
    var outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));

    try
    {
        // Dispose the document before the finally block runs, so that any handle it
        // holds on the input file is released before that file is deleted.
        using (Document pdfDocument = new Document(inputFilePath))
        {
            // Absorber that collects every phrase matching the regular expression.
            TextFragmentAbsorber textFragmentAbsorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");

            // 'true' switches the absorber to regular-expression search.
            TextSearchOptions textSearchOptions = new TextSearchOptions(true);
            textFragmentAbsorber.TextSearchOptions = textSearchOptions;

            // Run the absorber over all pages.
            pdfDocument.Pages.Accept(textFragmentAbsorber);

            TextFragmentCollection textFragmentCollection = textFragmentAbsorber.TextFragments;

            foreach (TextFragment textFragment in textFragmentCollection)
            {
                // Remove the matched text.
                textFragment.Text = string.Empty;

                // Shrink and whiten the (now empty) fragment.
                textFragment.TextState.Font = FontRepository.FindFont("Verdana");
                textFragment.TextState.FontSize = 1;
                textFragment.TextState.ForegroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
                textFragment.TextState.BackgroundColor = Aspose.Pdf.Color.FromRgb(System.Drawing.Color.White);
            }

            pdfDocument.Save(outputFilePath);
        }
    }
    finally
    {
        // The input file is always consumed, even when processing failed.
        if (File.Exists(inputFilePath))
        {
            File.Delete(inputFilePath);
        }
    }

    return outputFilePath;
}

如果要删除的内容位于同一页面上，上面的代码可以正常替换。我的问题是：如果文本跨越多个页面，TextFragmentAbsorber 就无法用上述正则表达式模式（“#START#((.|\r\n)*?)#END#”）匹配到该文本。

请问是否可以通过修改正则表达式，或者调整 Aspose 中的某些设置来解决这个问题？

4

1 回答 1

1

如前所述,由于架构限制,我们不能承诺早日解决您报告的问题。但是,我们已经修改了代码片段以满足您的要求。

思路是：先在文档的某一页上查找以“#START#”开头的文本，再在其后的某一页上查找以“#END#”结尾的文本，同时处理位于这两页之间的各页上的所有文本片段（如果存在）。

    /// <summary>
    /// Removes the text block delimited by <c>#START#</c> and <c>#END#</c> from a PDF,
    /// including the case where the block starts on one page and ends on a later one,
    /// and saves the result as a new, randomly named file in the temp directory.
    /// </summary>
    /// <param name="inputFilePath">
    /// Path of the PDF to process. The file is deleted afterwards, whether or not
    /// processing succeeded.
    /// </param>
    /// <returns>Path of the newly written PDF.</returns>
    /// <remarks>
    /// Only one multi-page block is handled: scanning stops as soon as the ending
    /// pattern has been found. If the starting pattern is found but the ending
    /// pattern never is, no text is removed.
    /// </remarks>
    private string DeleteMachineReadableCodeUpdated(string inputFilePath)
    {
        string outputFilePath = Path.Combine(Path.GetTempPath(), string.Format(@"{0}.pdf", Guid.NewGuid()));

        try
        {
            // Dispose the document before the finally block runs, so that any handle it
            // holds on the input file is released before that file is deleted.
            using (Document pdfDocument = new Document(inputFilePath))
            {
                // Absorber that collects every phrase matching the regular expression.
                TextFragmentAbsorber absorber = new TextFragmentAbsorber("#START#((.|\r\n)*?)#END#");

                // 'true' switches the absorber to regular-expression search.
                absorber.TextSearchOptions = new TextSearchOptions(true);

                // First attempt: look for the complete block across all pages.
                pdfDocument.Pages.Accept(absorber);
                TextFragmentCollection textFragmentCollection = absorber.TextFragments;

                if (textFragmentCollection.Count > 0)
                {
                    // The whole pattern was matched directly.
                    RemoveTextFromFragmentCollection(textFragmentCollection);
                }
                else
                {
                    // Nothing matched as a whole, so search for the block in parts:
                    // "#START#..." up to the end of the searched text, then
                    // "...#END#" from the beginning of the searched text.
                    string startingPattern = "#START#((.|\r\n)*?)\\z";
                    string endingPattern = "\\A((.|\r\n)*?)#END#";
                    bool isStartingPatternFound = false;
                    ArrayList fragmentsToRemove = new ArrayList();

                    foreach (Page page in pdfDocument.Pages)
                    {
                        // Until the start has been seen look for it; afterwards look for the end.
                        absorber.Phrase = isStartingPatternFound ? endingPattern : startingPattern;
                        page.Accept(absorber);

                        if (absorber.TextFragments.Count > 0)
                        {
                            fragmentsToRemove.AddRange(absorber.TextFragments);

                            if (isStartingPatternFound)
                            {
                                // Both ends found: clear everything collected and stop scanning.
                                RemoveTextFromFragmentCollection(fragmentsToRemove);
                                break;
                            }

                            // Only the start has been found so far - keep going.
                            isStartingPatternFound = true;
                        }
                        else if (isStartingPatternFound)
                        {
                            // A page between the start and the end page: re-run the absorber
                            // with an empty phrase and collect whatever fragments it yields.
                            absorber.Phrase = String.Empty;
                            page.Accept(absorber);
                            fragmentsToRemove.AddRange(absorber.TextFragments);
                        }
                        // Otherwise the block has not started yet - nothing to do on this page.
                    }
                }

                pdfDocument.Save(outputFilePath);
            }
        }
        finally
        {
            // The input file is always consumed, even when processing failed.
            if (File.Exists(inputFilePath))
            {
                File.Delete(inputFilePath);
            }
        }

        return outputFilePath;
    }

/// <summary>
/// Clears the text of every fragment in the given collection.
/// </summary>
/// <param name="fragmentCollection">
/// Collection whose items are all <c>TextFragment</c> instances; an item of any
/// other type makes the cast below throw.
/// </param>
private void RemoveTextFromFragmentCollection(ICollection fragmentCollection)
{
    foreach (object item in fragmentCollection)
    {
        // The collection is untyped, so each item is cast explicitly.
        TextFragment fragment = (TextFragment)item;
        fragment.Text = string.Empty;
    }
}

笔记:

  • 此代码假定文档中只有一个以“#START#”开头并以“#END#”结尾的文本块。然而,上面的代码可以很容易地修改来处理几个这样的块。
  • 您也可以不处理中间页面上的文本，而是记录这些页面的页码，然后在保存文档之前使用 pdfDocument.Pages.Delete(pageNumber) 将其删除。如果不希望出现“空白”页面，这样做可以避免它们。
于 2017-11-22T09:05:03.400 回答