0

我想在 PDF 文件中搜索一个单词并将其替换,例如搜索“错误”并替换为“正确”。我已经使用 iText v5.5.9(参考自 http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm)写出了一个基本可用的测试程序(替换后的文本似乎显示在最上层)。我想知道 iText v7 是否会更好/更简单,是否有人做过或者可以提供帮助。下面是 v5 的测试代码:它使用读/写密码从数据库中读取 PDF,然后再使用读/写密码将其写回:

' Based on http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm

Imports System.IO 'Working With Files
Imports System.Text 'Working With Text
Imports System.Data.SqlClient

Imports iTextSharp.text 'Core PDF Text Functionalities
Imports iTextSharp.text.pdf 'PDF Content
Imports iTextSharp.text.pdf.parser 'Content Parser

Imports pdf_clr.LocTextExtraction 'Import LocationTextExtractionStrategy Capabilities

Public Class Class1

''' <summary>
''' Loads a PDF from the docstore table, overlays every occurrence of <paramref name="strSearch"/>
''' with <paramref name="strReplace"/> on each page, re-encrypts the result, writes it to
''' <paramref name="strDest"/> as a test artefact and, when the id is numeric, stores it back
''' through the sp_DOCSTORE_ADD_binary stored procedure.
''' </summary>
''' <param name="strSource">Document id used in the docstore lookup; the same string is used as the PDF owner password.</param>
''' <param name="strDest">Output file path; when Nothing or empty a random file under C:\tmp\ is used.</param>
''' <param name="iDocType">Not used by this routine.</param>
''' <param name="strSearch">Text to look for on every page.</param>
''' <param name="strReplace">Text drawn over each match.</param>
''' <param name="bCase">True for a case-sensitive search, False to ignore case.</param>
Public Shared Sub ReplacePDFText(ByVal strSource As String, ByVal strDest As String, ByVal iDocType As SByte, ByVal strSearch As String, ByVal strReplace As String, ByVal bCase As Boolean)
    ' NOTE(review): connection string is hard-coded for testing ("context connection=true"
    ' was the previously overwritten SQL CLR variant) - move to configuration before release.
    Dim strSqlConnection As String = "Data Source=SERVER;Initial Catalog=DATABASE;Integrated Security=True"
    Dim strPassword As String = strSource
    Dim dbPDF As Byte() = Nothing 'Document bytes from the database

    Using connection As New SqlConnection(strSqlConnection)
        ' The id is passed as a parameter so that it can never be interpreted as SQL text.
        Using command As New SqlCommand("SELECT pdf FROM docstore WHERE id=@id", connection)
            command.Parameters.AddWithValue("@id", strSource)
            connection.Open()
            ' TryCast yields Nothing for "no row" as well as for a DBNull column value.
            dbPDF = TryCast(command.ExecuteScalar(), Byte())
        End Using
    End Using

    If dbPDF Is Nothing Then
        Return 'Nothing to process
    End If

    If String.IsNullOrEmpty(strDest) Then
        strDest = "C:\tmp\" & System.IO.Path.GetRandomFileName() & ".pdf"
    End If

    Dim ownerPassword As Byte() = Encoding.ASCII.GetBytes(strPassword)

    ' The comparison mode does not depend on the page, so it is resolved once.
    Dim scCase As StringComparison = If(bCase, StringComparison.CurrentCulture, StringComparison.CurrentCultureIgnoreCase)

    Dim pdfFileReader As New PdfReader(dbPDF, ownerPassword) 'Read PDF
    Try
        Using msPDF As New MemoryStream()
            Dim psStamp As New PdfStamper(pdfFileReader, msPDF) 'MemoryStream as destination
            psStamp.Writer.CloseStream = False 'Keep the stream readable after the stamper closes

            ' Re-apply the read/write password to the output document
            psStamp.SetEncryption(Nothing, ownerPassword, PdfWriter.ALLOW_PRINTING, PdfWriter.DO_NOT_ENCRYPT_METADATA)

            For intCurrPage As Integer = 1 To pdfFileReader.NumberOfPages 'Loop through all pages
                Dim lteStrategy As LocTextExtractionStrategy = New LocTextExtractionStrategy

                ' NOTE(review): the under-content layer is drawn beneath the existing page content;
                ' confirm whether GetOverContent is what is needed to actually hide the old text.
                Dim pcbContent As PdfContentByte = psStamp.GetUnderContent(intCurrPage)

                lteStrategy.UndercontentCharacterSpacing = pcbContent.CharacterSpacing
                lteStrategy.UndercontentHorizontalScaling = pcbContent.HorizontalScaling

                ' Run the extraction only to feed the strategy with the page's text chunks;
                ' the returned string itself is not needed.
                PdfTextExtractor.GetTextFromPage(pdfFileReader, intCurrPage, lteStrategy)

                DoSearchReplace(lteStrategy, pcbContent, psStamp, strSearch, strReplace, scCase, "SearchReplaceLayer")
            Next 'page

            psStamp.Close() 'Flushes the stamped document into msPDF

            dbPDF = msPDF.ToArray()
        End Using
    Finally
        pdfFileReader.Close()
    End Try

    ' Write file as check during testing
    File.WriteAllBytes(strDest, dbPDF)

    If IsNumeric(strSource) Then
        Using connection As New SqlConnection(strSqlConnection)
            Using cmd As New SqlCommand("sp_DOCSTORE_ADD_binary", connection) 'Updates or inserts into db
                cmd.CommandType = Data.CommandType.StoredProcedure
                ' Stored procedure parameters as needed
                cmd.Parameters.Add("@FILE", Data.SqlDbType.VarBinary).Value = dbPDF
                cmd.Parameters.Add("@retvalue", Data.SqlDbType.Int).Direction = Data.ParameterDirection.ReturnValue

                connection.Open()
                cmd.ExecuteNonQuery()
            End Using
        End Using
    End If

End Sub

''' <summary>
''' Covers every location where <paramref name="strSearch"/> was found by the extraction strategy
''' with a white rectangle and draws <paramref name="strReplace"/> at the same position, inside an
''' optional-content layer named <paramref name="strLayer"/>.
''' </summary>
''' <param name="lteStrategy">Strategy that has already processed the page and can report match rectangles.</param>
''' <param name="pcbContent">Page content stream to draw into.</param>
''' <param name="psStamp">Stamper whose writer owns the layer.</param>
''' <param name="strSearch">Text to locate.</param>
''' <param name="strReplace">Text to draw at each match.</param>
''' <param name="scCase">Comparison mode used for matching.</param>
''' <param name="strLayer">Name of the layer that receives the replacement text.</param>
Public Shared Sub DoSearchReplace(ByRef lteStrategy As LocTextExtractionStrategy, ByRef pcbContent As PdfContentByte, ByRef psStamp As PdfStamper, ByVal strSearch As String, ByVal strReplace As String, ByVal scCase As StringComparison, ByVal strLayer As String)
    'Determine Match(es)
    Dim lstMatches As List(Of iTextSharp.text.Rectangle) = lteStrategy.GetTextLocations(strSearch, scCase)

    ' Without matches there is nothing to draw; returning here also avoids
    ' registering an empty layer with the writer for this page.
    If lstMatches Is Nothing OrElse lstMatches.Count = 0 Then
        Return
    End If

    Dim pdLayer As New PdfLayer(strLayer, psStamp.Writer) 'Layer that holds the replacement text

    ' The replacement font is the same for every match, so it is created once.
    Dim bfReplace As BaseFont = BaseFont.CreateFont(BaseFont.HELVETICA_OBLIQUE, BaseFont.CP1252, BaseFont.NOT_EMBEDDED)

    For Each rctRect As iTextSharp.text.Rectangle In lstMatches 'Loop Through Each Match

        ' Isolate the graphics state per match: the fill colour is changed to black for the
        ' text below, and must not leak into the white masking rectangle of the next match.
        pcbContent.SaveState()

        pcbContent.SetColorFill(BaseColor.WHITE) 'Mask colour
        pcbContent.Rectangle(rctRect.Left, rctRect.Bottom, rctRect.Width, rctRect.Height)
        pcbContent.Fill() 'Paint the mask over the matched area

        pcbContent.BeginLayer(pdLayer)
        pcbContent.SetColorFill(BaseColor.BLACK) 'Text colour
        pcbContent.BeginText()
        pcbContent.SetTextMatrix(rctRect.Left, rctRect.Bottom) 'Position text at the match origin
        pcbContent.SetFontAndSize(bfReplace, 12)
        pcbContent.ShowText(strReplace) 'Replacing Text
        pcbContent.EndText()
        pcbContent.EndLayer()

        pcbContent.RestoreState()

    Next 'rectangle
End Sub

干杯。

4

1 回答 1

0

基本思想(伪代码)是

  1. 实现 IEventListener/ITextExtractionStrategy
  2. 将此类用作文档每一页的 PdfTextExtractor 的参数
  3. 文档中的每个事件都会通知您的类。您关心的是 TextRenderInfo 类型的事件(即把文本渲染到页面上的事件)
  4. 聚合 TextRenderInfo 事件,并对它们进行排序(按逻辑阅读顺序)以获得文档中文本的概览
  5. 使用正则表达式搜索与所需属性匹配的所有文本,将文本映射回它们来自的 TextRenderInfo 对象
  6. 根据您收集的 TextRenderInfo 对象以及要替换的对象,重建 .pdf 文档
于 2017-04-05T09:29:08.903 回答