0

我有一段 VB.NET 代码,它可以根据 PDF 中包含的标识,对目录路径下的所有 PDF 进行拆分/合并,运行良好。新的问题是:该目录中将有 1000 多个 PDF,而用户会在 Excel 的某一列中按文件名列出特定的 PDF,只有这些 PDF 需要在当天按上述标识进行拆分/合并。

例如,一个目录有

ZTEST11.SAMPLE01

ZTEST12.SAMPLE02

ZTEST13.SAMPLE03

ZTEST14.SAMPLE04

ZTEST15.SAMPLE05

ZTEST+1....(依此类推)

但在所有这些文件中,我的 Excel 文件的 A 列只列出了以下几个(而且行数每天都会变化):

ZTEST11.SAMPLE01

ZTEST13.SAMPLE03

ZTEST15.SAMPLE05

所以这些是我希望我的代码影响的唯一文件。

我的代码是这个

Module Module1

''' <summary>
''' Pairs a marker string that is searched for on a PDF page (Identifier)
''' with the label used for that kind of page in output file names (TypeName).
''' </summary>
Class PageType
    ''' <summary>Marker text looked up on each page.</summary>
    Public Property Identifier As String

    ''' <summary>Label inserted into the extracted page's file name.</summary>
    Public Property TypeName As String
End Class

' Entry point (the asker's non-working attempt).
' Opens the workbook, walks every cell of sheet1's used range and, for each cell,
' tries to split the PDF named by that cell into single-page files under the
' "unmerged" folder, naming each page after the last marker found. Finally calls
' Xavier(), which is defined elsewhere.
Sub Main(ByVal args As String())
    Dim xlApp As Excel.Application
    Dim xlWorkBook As Excel.Workbook
    Dim xlWorkSheet As Excel.Worksheet
    Dim range As Excel.Range
    xlApp = New Excel.Application
    xlWorkBook = xlApp.Workbooks.Open("C:\Users\XBorja.RESURGENCE\Desktop\xavier.xlsx")
    xlWorkSheet = xlWorkBook.Worksheets("sheet1")

    ' Every used cell on the sheet is visited below, not only column A.
    range = xlWorkSheet.UsedRange


    ' Folder that holds the source PDFs.
    Dim dir = "G:\Word\Department Folders\Pre-Suit\Drafts-IL\2-IL_AttyReview\2018-09\Reviewed\"

    ' Folder that receives the extracted single-page PDFs.
    Dim unmerged = Combine(dir, "unmerged")
    ' Set up a list of the identifiers to be searched for and the corresponding names to be used in the filename.
    Dim pageTypes As New List(Of PageType)
    Dim ids = {"COVERSPLIT", "COMPLAINTSPLIT", "EXHIBITSPLIT", "MILSPLIT", "SUMSPLIT"}
    Dim nams = {" Cover Sheet ", " Complaint ", " Exhibit ", " Military ", " Summons "}

    ' For Each inputfile As String In Directory.GetFiles(dir, "*.pdf")
    For Each aCell In range

        ' Shows a dialog for every cell that is visited.
        MsgBox(aCell.Value)


        ' NOTE(review): Combine is presumably System.IO.Path.Combine, which returns a single
        ' String. This For Each therefore enumerates the characters of that path instead of
        ' a list of files - this is the defect the question is about.
        For Each inputfile As String In Combine(dir, aCell.value)
            ' NOTE(review): pageTypes is filled again on every pass, so it accumulates duplicates.
            For i = 0 To ids.Length - 1


                pageTypes.Add(New PageType With {.Identifier = ids(i), .TypeName = nams(i)})
            Next

            ' NOTE(review): Excel is closed and released inside the loops while the outer
            ' For Each is still enumerating `range`; later iterations use a closed workbook.
            xlWorkBook.Close()
            xlApp.Quit()
            releaseObject(xlApp)
            releaseObject(xlWorkBook)
            releaseObject(xlWorkSheet)

            Dim extractor As New TextExtractor()

            ' Load sample PDF document
            extractor.LoadDocumentFromFile(inputfile)

            Dim pageCount = extractor.GetPageCount()
            ' Label used for pages seen before any marker has been found.
            Dim currentPageTypeName = "UNKNOWN"
            Dim Path As String = IO.Path.GetFileNameWithoutExtension(inputfile)
            ' The first 7 characters of the file name prefix every output file;
            ' Substring throws when the name is shorter than that.
            Dim extracted = Path.Substring(0, 7)
            ' Search each page for a keyword

            For i = 0 To pageCount - 1

                ' Find the type of the current page
                ' If it is not present on the page, then the last one found will be used.
                For Each pt In pageTypes
                    If extractor.Find(i, pt.Identifier, False) Then
                        currentPageTypeName = pt.TypeName
                    End If
                Next


                ' Extract page
                Using splitter As New DocumentSplitter() With {.OptimizeSplittedDocuments = True}
                    Dim pageNumber = i + 1   ' (!) page number in ExtractPage() is 1-based

                    ' Create the output folder on first use.
                    If Not Directory.Exists(dir & "\unmerged") Then
                        Directory.CreateDirectory(dir & "\unmerged")
                    End If

                    ' Output name: <7-char prefix><page type label><1-based page number>.pdf
                    Dim outputfile = Combine(unmerged, extracted & currentPageTypeName & pageNumber & ".pdf")

                    splitter.ExtractPage(inputfile, outputfile, pageNumber)

                    Console.WriteLine("Extracted page " & pageNumber & " to file """ & outputfile & """")

                End Using

            Next
            extractor.Dispose()

        Next ' for each
    Next




    ' Hand over to the merge step.
    Call Xavier()
End Sub

如您所见,我添加了一段代码来打开我的 Excel 工作簿,并逐个读出 A 列中每个单元格的值,这些值就是我要合并的 PDF 的文件编号。

这很好用。但是我如何将这些值放入我的代码中,以便代码知道这些是我想要合并到该选定目录中的特定 PDF 文件?

您可以看到我注释掉的内容: For Each inputfile As String In Directory.GetFiles(dir, "*.pdf")

这就是我之前使用的,以便我的代码根据我定义的标准合并该目录中的所有 PDF。

我该如何修改代码,把单元格的值转换为字符串,使代码能够把每个单元格的值当作目录中的一个 PDF 文件来遍历,从而只选中这些文件进行合并?

4

1 回答 1

0

解决方案:

Option Infer On
'Option Strict On

Imports Bytescout.PDFExtractor
Imports System.Collections
Imports System.Collections.Generic
Imports System.IO.Path
Imports System.IO
Imports System.Linq
Imports System.Text
Imports System.Threading.Tasks
Imports System
Imports System.Diagnostics
Imports PdfSharp.Pdf
Imports PdfSharp.Pdf.IO
Imports System.Deployment
Imports ExcelDataReader
Imports Microsoft
Imports Microsoft.Office.Interop
Imports Microsoft.Office.Core
Imports Microsoft.Office.Interop.Excel


Module Module1

    ''' <summary>
    ''' Pairs a marker string that is searched for on a PDF page (Identifier)
    ''' with the label used for that kind of page in output file names (TypeName).
    ''' </summary>
    Class PageType
        Property Identifier As String
        Property TypeName As String
    End Class

    ' Workbook and sheet whose cells list the PDFs to process (file name without ".pdf").
    Private Const WorkbookPath As String = "C:\Users\XBorja.RESURGENCE\Desktop\xavier.xlsx"
    Private Const SheetName As String = "sheet1"

    ' Number of leading file-name characters used as the prefix of every extracted page file.
    Private Const PrefixLength As Integer = 7

    ''' <summary>
    ''' Reads the requested file names from the workbook, splits each matching PDF in the
    ''' source folder into single-page files under "unmerged", then calls Xavier() to merge them.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Sub Main(ByVal args As String())
        Dim dir = "G:\Word\Department Folders\Pre-Suit\Drafts-IL\2-IL_AttyReview\2018-09\Reviewed\"
        Dim unmerged = Combine(dir, "unmerged")

        ' Set up a list of the identifiers to be searched for and the corresponding names
        ' to be used in the filename. Built once: the list is the same for every file.
        Dim ids = {"COVERSPLIT", "COMPLAINTSPLIT", "EXHIBITSPLIT", "MILSPLIT", "SUMSPLIT"}
        Dim nams = {" Cover Sheet ", " Complaint ", " Exhibit ", " Military ", " Summons "}
        Dim pageTypes As New List(Of PageType)
        For i = 0 To ids.Length - 1
            pageTypes.Add(New PageType With {.Identifier = ids(i), .TypeName = nams(i)})
        Next

        ' Excel is opened, read and closed before any PDF work starts, so a failure while
        ' splitting cannot leave an orphaned Excel process behind.
        Dim requestedNames = ReadRequestedNames()

        ' CreateDirectory is a no-op when the folder already exists. Creating it up front
        ' also guarantees the folder exists for the merge step even if nothing is extracted.
        Directory.CreateDirectory(unmerged)

        For Each requestedName In requestedNames
            Dim matches = Directory.GetFiles(dir, requestedName & ".pdf")
            If matches.Length = 0 Then
                Console.WriteLine("No file """ & requestedName & ".pdf"" found in """ & dir & """")
                Continue For
            End If

            For Each inputfile As String In matches
                SplitIntoPages(inputfile, unmerged, pageTypes)
            Next
        Next

        Call Xavier()
    End Sub

    ''' <summary>
    ''' Returns the trimmed text of every non-empty cell in the used range of the sheet.
    ''' Excel is always closed and its COM objects released, even when reading fails.
    ''' </summary>
    ''' <returns>The cell values in enumeration order; empty cells are skipped.</returns>
    Private Function ReadRequestedNames() As List(Of String)
        Dim names As New List(Of String)
        Dim xlApp As Excel.Application = Nothing
        Dim xlWorkBook As Excel.Workbook = Nothing
        Dim xlWorkSheet As Excel.Worksheet = Nothing
        Dim range As Excel.Range = Nothing

        Try
            xlApp = New Excel.Application
            xlWorkBook = xlApp.Workbooks.Open(WorkbookPath)
            xlWorkSheet = xlWorkBook.Worksheets(SheetName)
            range = xlWorkSheet.UsedRange

            ' NOTE(review): this visits every used cell, not only column A, as the
            ' original loop did - the sheet is assumed to contain nothing but file names.
            For Each aCell As Object In range
                Dim raw As Object = aCell.Value
                If raw Is Nothing Then Continue For

                Dim cellText = raw.ToString().Trim()
                If cellText.Length > 0 Then names.Add(cellText)
            Next
        Finally
            ' The workbook is only read, so close it without saving and without a prompt.
            If xlWorkBook IsNot Nothing Then xlWorkBook.Close(False)
            If xlApp IsNot Nothing Then xlApp.Quit()

            ' Release children before their parents.
            releaseObject(range)
            releaseObject(xlWorkSheet)
            releaseObject(xlWorkBook)
            releaseObject(xlApp)
        End Try

        Return names
    End Function

    ''' <summary>
    ''' Extracts every page of <paramref name="inputfile"/> into its own PDF inside
    ''' <paramref name="outputDir"/>. Each output file is named
    ''' &lt;prefix&gt;&lt;page type label&gt;&lt;1-based page number&gt;.pdf.
    ''' </summary>
    ''' <param name="inputfile">Full path of the PDF to split.</param>
    ''' <param name="outputDir">Existing folder that receives the page files.</param>
    ''' <param name="pageTypes">Markers to look for and the label each one maps to.</param>
    Private Sub SplitIntoPages(ByVal inputfile As String, ByVal outputDir As String, ByVal pageTypes As List(Of PageType))
        Dim baseName = GetFileNameWithoutExtension(inputfile)
        ' Guard against names shorter than the prefix; Substring would throw on them.
        Dim prefix = If(baseName.Length > PrefixLength, baseName.Substring(0, PrefixLength), baseName)

        Dim extractor As New TextExtractor()
        Try
            extractor.LoadDocumentFromFile(inputfile)

            Dim pageCount = extractor.GetPageCount()
            ' Label used for pages that come before the first marker.
            Dim currentPageTypeName = "UNKNOWN"

            For i = 0 To pageCount - 1

                ' Find the type of the current page.
                ' If it is not present on the page, then the last one found will be used.
                For Each pt In pageTypes
                    If extractor.Find(i, pt.Identifier, False) Then
                        currentPageTypeName = pt.TypeName
                    End If
                Next

                Using splitter As New DocumentSplitter() With {.OptimizeSplittedDocuments = True}
                    Dim pageNumber = i + 1   ' (!) page number in ExtractPage() is 1-based
                    Dim outputfile = Combine(outputDir, prefix & currentPageTypeName & pageNumber & ".pdf")

                    splitter.ExtractPage(inputfile, outputfile, pageNumber)

                    Console.WriteLine("Extracted page " & pageNumber & " to file """ & outputfile & """")
                End Using
            Next
        Finally
            ' Dispose the extractor even when loading, searching or extracting fails.
            extractor.Dispose()
        End Try
    End Sub

    ''' <summary>
    ''' Best-effort release of a COM object, followed by a garbage collection.
    ''' The parameter is passed ByVal, so the caller's own variable is not cleared here.
    ''' </summary>
    ''' <param name="aCell">COM object to release; Nothing is ignored.</param>
    Private Sub releaseObject(ByVal aCell As Object)
        If aCell Is Nothing Then Return

        Try
            System.Runtime.InteropServices.Marshal.ReleaseComObject(aCell)
        Catch ex As Exception
            ' Cleanup must not abort the program; report the failure and carry on.
            Console.WriteLine("Could not release COM object: " & ex.Message)
        Finally
            GC.Collect()
        End Try
    End Sub
End Module


Module Module2
    ' Folder containing the single-page PDFs to merge.
    Private inputdir As String = "G:\Word\Department Folders\Pre-Suit\Drafts-IL\2-IL_AttyReview\2018-09\Reviewed\unmerged\"


    ''' <summary>
    ''' Merges the single-page PDFs in the input folder, one page type at a time.
    ''' </summary>
    Public Sub Xavier()

        MergeFiles("Cover Sheet", inputdir)
        MergeFiles("Complaint", inputdir)
        MergeFiles("Exhibit", inputdir)
        MergeFiles("Military", inputdir)
        MergeFiles("Summons", inputdir)
    End Sub

    ''' <summary>
    ''' Merges every "*&lt;name&gt;*.pdf" file in <paramref name="inputdir"/> into
    ''' "&lt;first 7 characters&gt; &lt;name&gt;.pdf" inside the "Merge" subfolder.
    ''' Files that share the same first 7 characters go into the same output document,
    ''' ordered by the number at the end of their file name.
    ''' If the output file already exists, the pages are appended to it.
    ''' </summary>
    ''' <param name="name">Page type label contained in the file names, e.g. "Complaint".</param>
    ''' <param name="inputdir">Folder to search; the "Merge" subfolder is created if missing.</param>
    Public Sub MergeFiles(ByVal name As String, inputdir As String)
        ' Combine copes with the trailing separator in inputdir (no doubled "\\").
        Dim outputDir As String = Combine(inputdir, "Merge")
        Directory.CreateDirectory(outputDir)

        ' One group per output document. The first 7 characters always exist because
        ' every matched name contains the label, which is at least 7 characters long.
        Dim groups = Directory.GetFiles(inputdir, "*" & name & "*.pdf").
            GroupBy(Function(f) GetFileNameWithoutExtension(f).Substring(0, 7), StringComparer.OrdinalIgnoreCase)

        For Each documentGroup In groups
            Dim outputPath = Combine(outputDir, documentGroup.Key & " " & name & ".pdf")

            ' NOTE(review): appending to an existing output is kept from the original;
            ' running twice over the same input therefore duplicates pages.
            Using outputDocument As PdfDocument = If(File.Exists(outputPath), PdfReader.Open(outputPath), New PdfDocument())

                ' GetFiles gives no ordering guarantee and a by-name sort puts "10" before
                ' "2", so order explicitly by the numeric page suffix.
                Dim orderedFiles = documentGroup.
                    OrderBy(Function(f) TrailingNumber(f)).
                    ThenBy(Function(f) f, StringComparer.OrdinalIgnoreCase)

                For Each pageFile In orderedFiles
                    Console.WriteLine("Merging: {0}...", GetFileName(pageFile))
                    Using inputDocument As PdfDocument = PdfReader.Open(pageFile, PdfDocumentOpenMode.Import)
                        For Each page As PdfPage In inputDocument.Pages
                            outputDocument.AddPage(page)
                        Next
                    End Using
                Next

                ' Saved once per output document instead of once per page file.
                outputDocument.Save(outputPath)
            End Using
        Next

    End Sub

    ''' <summary>
    ''' Returns the number formed by the digits at the end of the file name
    ''' (extension excluded), or Integer.MaxValue when there is none so that
    ''' such files sort last.
    ''' </summary>
    Private Function TrailingNumber(ByVal filePath As String) As Integer
        Dim stem = GetFileNameWithoutExtension(filePath)
        Dim firstDigit = stem.Length

        While firstDigit > 0 AndAlso Char.IsDigit(stem(firstDigit - 1))
            firstDigit -= 1
        End While

        Dim number As Integer
        If Integer.TryParse(stem.Substring(firstDigit), number) Then Return number
        Return Integer.MaxValue
    End Function
End Module
于 2018-09-12T17:28:36.603 回答