我补充这个回答,是为了让这个问题在 SSS 的指点下得到完整的解答。下面是在 Office 文档(.doc 等旧格式)、Office Open XML 文档(.docx 等)、PDF 以及其他通用文件格式中搜索文本字符串的完整代码。
Imports System.IO
Imports System.Xml.XmlReader
Imports DocumentFormat.OpenXml.Packaging
Imports DocumentFormat.OpenXml.Wordprocessing
Imports DocumentFormat.OpenXml.Spreadsheet
Imports DocumentFormat.OpenXml
Imports System.Linq
Imports System
Imports System.Collections.Generic
Imports A = DocumentFormat.OpenXml.Drawing
Imports DocumentFormat.OpenXml.Presentation
Imports System.Text
Imports iTextSharp.text
Imports iTextSharp.text.pdf
Module searchFiles
' Raw text of the file most recently read by ListFiles. fileExtention hands it
' to catchallProcessing for files that have no dedicated format handler.
Public readAllText As String
''' <summary>
''' Empties the results list on MainForm, then scans the \\Max1\dept\ share
''' (including subdirectories) and lists every file that matches.
''' </summary>
''' <param name="searchText">Text handed to ListFiles as the search target.</param>
Public Sub startSearch(ByVal searchText As String)
    ' Start from a clean list so results of an earlier search do not linger.
    Dim resultsView As ListView = MainForm.marketIntelligencelboxsearch
    resultsView.Items.Clear()

    Dim searchRoot As DirectoryInfo = New DirectoryInfo("\\Max1\dept\")
    ListFiles(resultsView, searchRoot, searchText)
End Sub
''' <summary>
''' Recursively adds to <paramref name="lst"/> every file under
''' <paramref name="dir_info"/> whose name or content contains
''' <paramref name="target"/>.
''' </summary>
''' <param name="lst">List view that receives one item per matching file.</param>
''' <param name="dir_info">Directory to scan; all of its subdirectories are scanned too.</param>
''' <param name="target">
''' Text to look for, compared case-insensitively. The literal "ALL" lists every file.
''' </param>
Private Sub ListFiles(ByVal lst As ListView, ByVal dir_info As DirectoryInfo, ByVal target As String)
    ' Files in this directory.
    For Each fs_info As FileInfo In dir_info.GetFiles("*.*")
        ' Cheap test first: "ALL" or a hit in the file name itself. OrElse avoids
        ' evaluating the name test when "ALL" was requested.
        Dim isMatch As Boolean = (target = "ALL") OrElse fs_info.Name.IndexOf(target, StringComparison.OrdinalIgnoreCase) >= 0

        If Not isMatch Then
            Try
                ' fileExtention's catch-all branch reads the module-level
                ' readAllText, so it has to be refreshed before every call.
                readAllText = File.ReadAllText(fs_info.FullName)
                isMatch = fileExtention(fs_info.FullName, target) <> 0
            Catch ex As IOException
                ' Locked or otherwise unreadable file: skip it instead of
                ' aborting the whole recursive scan.
                System.Diagnostics.Debug.WriteLine("Skipped " & fs_info.FullName & ": " & ex.Message)
            Catch ex As UnauthorizedAccessException
                System.Diagnostics.Debug.WriteLine("Skipped " & fs_info.FullName & ": " & ex.Message)
            End Try
        End If

        If isMatch Then
            ' Second argument is whatever MainForm.sourceFileImageIndex returns
            ' for the path (presumably the icon index - confirm against MainForm).
            lst.Items.Add(fs_info.Name, MainForm.sourceFileImageIndex(fs_info.FullName))
        End If
    Next fs_info

    ' Search subdirectories.
    For Each subdir As DirectoryInfo In dir_info.GetDirectories()
        ListFiles(lst, subdir, target)
    Next subdir
End Sub
''' <summary>
''' Picks the content search routine that matches the file's extension and
''' returns its result.
''' </summary>
''' <param name="sourcePath">Path of the file to search.</param>
''' <param name="target">Text to look for.</param>
''' <returns>The value returned by the selected routine (non-zero means a match).</returns>
''' <remarks>
''' Files without a dedicated handler are searched through the module-level
''' readAllText, which the caller must have filled with this file's text.
''' </remarks>
Public Function fileExtention(ByVal sourcePath As String, ByVal target As String) As Integer
    ' Compare the real extension, case-insensitively, so ".DOCX" and ".docm" are
    ' recognised and a name such as "notes.pdf.txt" is not mistaken for a PDF.
    Dim extension As String = If(System.IO.Path.GetExtension(sourcePath), String.Empty).ToLowerInvariant()

    Select Case extension
        Case ".docx", ".docm"
            Return WordProcessing(sourcePath, target)
        Case ".xlsx", ".xlsm"
            Return ExcelProcessing(sourcePath, target)
        Case ".pptx", ".pptm"
            ' Reads slide text and notes.
            Return PowerpointProcessing(sourcePath, target)
        Case ".pdf"
            ' Searches the text extracted from the PDF pages.
            Return pdfProcesssing(sourcePath, target)
        Case Else
            ' Office documents before 2007 and all other generic extensions,
            ' including Access 2007 and lower: search the raw file text.
            Return catchallProcessing(readAllText, target)
    End Select
End Function
#Region "Search Index"
''' <summary>
''' Case-insensitive substring test used by every format handler.
''' </summary>
''' <param name="strDoc">Text to search in; may be Nothing.</param>
''' <param name="target">Text to look for; may be Nothing.</param>
''' <returns>
''' 1 when <paramref name="strDoc"/> contains <paramref name="target"/>
''' (ordinal, ignoring case); 0 otherwise, including when either argument is Nothing.
''' </returns>
Public Function catchallProcessing(ByVal strDoc As String, ByVal target As String) As Integer
    ' String.IndexOf throws on a Nothing search value, so treat it as "no match".
    If strDoc Is Nothing OrElse target Is Nothing Then
        Return 0
    End If

    If strDoc.IndexOf(target, StringComparison.OrdinalIgnoreCase) >= 0 Then
        Return 1
    End If
    Return 0
End Function
#End Region
#Region "Word 2007 Processing"
''' <summary>
''' Searches the body text of a Word 2007+ document (.docx/.docm).
''' </summary>
''' <param name="strDoc">Path of the document.</param>
''' <param name="target">Text to look for.</param>
''' <returns>The result of catchallProcessing on the body text (0 or 1).</returns>
Public Function WordProcessing(ByVal strDoc As String, ByVal target As String) As Integer ' Word 2007 and higher
    Dim bodyText As String = String.Empty

    ' Open read-only: searching must not need write access, and Using releases
    ' the package even when reading fails.
    Using wordDoc As WordprocessingDocument = WordprocessingDocument.Open(strDoc, False)
        Dim mainPart As MainDocumentPart = wordDoc.MainDocumentPart
        If mainPart IsNot Nothing AndAlso mainPart.Document IsNot Nothing AndAlso mainPart.Document.Body IsNot Nothing Then
            bodyText = mainPart.Document.Body.InnerText
        End If
    End Using

    Return catchallProcessing(bodyText, target) ' 0 or 1
End Function
#End Region
#Region "Excel 2007 Processing"
''' <summary>
''' Searches the shared-string table of an Excel 2007+ workbook (.xlsx/.xlsm).
''' </summary>
''' <param name="strDoc">Path of the workbook.</param>
''' <param name="target">Text to look for.</param>
''' <returns>The result of catchallProcessing on the collected strings (0 or 1).</returns>
''' <remarks>
''' Only entries of the shared-string table are read; values stored elsewhere in
''' the workbook are not examined by this routine.
''' </remarks>
Public Function ExcelProcessing(ByVal strDoc As String, ByVal target As String) As Integer ' Excel 2007 and higher
    Dim cellText As New StringBuilder()

    Using xlDoc As SpreadsheetDocument = SpreadsheetDocument.Open(strDoc, False)
        Dim wbPart As WorkbookPart = xlDoc.WorkbookPart
        Dim stringsPart As SharedStringTablePart = Nothing
        If wbPart IsNot Nothing Then
            stringsPart = wbPart.SharedStringTablePart
        End If

        ' A workbook that holds no text has no shared-string part at all.
        If stringsPart IsNot Nothing AndAlso stringsPart.SharedStringTable IsNot Nothing Then
            For Each entry As SharedStringItem In stringsPart.SharedStringTable.Elements(Of SharedStringItem)()
                ' One entry per line so a match cannot span two unrelated cells.
                cellText.AppendLine(entry.InnerText)
            Next
        End If
    End Using

    Return catchallProcessing(cellText.ToString(), target)
End Function
#End Region
#Region "Powerpoint 2007 Processing"
''' <summary>
''' Searches the slide text and speaker notes of a PowerPoint 2007+
''' presentation (.pptx/.pptm).
''' </summary>
''' <param name="file">Path of the presentation.</param>
''' <param name="target">Text to look for.</param>
''' <returns>The result of catchallProcessing on the collected text (0 or 1).</returns>
Public Function PowerpointProcessing(ByVal file As String, ByVal target As String) As Integer
    Dim totalText As New StringBuilder()

    ' Open the package once for all slides instead of once per slide.
    Using ppt As PresentationDocument = PresentationDocument.Open(file, False)
        Dim numberOfSlides As Integer = CountSlides(ppt)
        For i As Integer = 0 To numberOfSlides - 1
            ' One slide per line so a match cannot span two slides.
            totalText.AppendLine(ReadSlideText(ppt.PresentationPart, i))
        Next
    End Using

    Return catchallProcessing(totalText.ToString(), target)
End Function

''' <summary>
''' Counts the slides of the presentation stored at <paramref name="presentationFile"/>.
''' </summary>
Public Function CountSlides(ByVal presentationFile As String) As Integer
    Using powerpointDocument As PresentationDocument = PresentationDocument.Open(presentationFile, False)
        Return CountSlides(powerpointDocument)
    End Using
End Function

''' <summary>
''' Counts the slide parts of an already opened presentation.
''' </summary>
''' <returns>The number of slide parts, or 0 when there is no presentation part.</returns>
''' <exception cref="ArgumentNullException">
''' <paramref name="powerpointDocument"/> is Nothing.
''' </exception>
Public Function CountSlides(ByVal powerpointDocument As PresentationDocument) As Integer
    If powerpointDocument Is Nothing Then
        ' Report the actual parameter name.
        Throw New ArgumentNullException("powerpointDocument")
    End If

    Dim presentationPart As PresentationPart = powerpointDocument.PresentationPart
    If presentationPart Is Nothing Then
        Return 0
    End If
    Return presentationPart.SlideParts.Count()
End Function

''' <summary>
''' Reads the text of one slide, followed by the text of its notes, into
''' <paramref name="sldText"/>.
''' </summary>
''' <param name="sldText">Receives the slide text concatenated with the notes text.</param>
''' <param name="docName">Path of the presentation.</param>
''' <param name="index">Zero-based position of the slide in the slide ID list.</param>
''' <returns>Always Nothing; the text is delivered through <paramref name="sldText"/>.</returns>
Public Function GetSlideIdandText(ByRef sldText As String, ByVal docName As String, ByVal index As Integer) As Object
    Using ppt As PresentationDocument = PresentationDocument.Open(docName, False)
        sldText = ReadSlideText(ppt.PresentationPart, index)
    End Using
    Return Nothing
End Function

' Returns the text of the slide at the given position in the slide ID list,
' followed by the text of its notes page when the slide has one.
Private Function ReadSlideText(ByVal part As PresentationPart, ByVal index As Integer) As String
    Dim slideIDs As OpenXmlElementList = part.Presentation.SlideIdList.ChildElements
    Dim relID As String = TryCast(slideIDs(index), SlideId).RelationshipId
    Dim sldPart As SlidePart = DirectCast(part.GetPartById(relID), SlidePart)

    Dim combined As New StringBuilder()
    For Each textNode As A.Text In sldPart.Slide.Descendants(Of A.Text)()
        combined.Append(textNode.Text)
    Next

    ' Slides without speaker notes have no notes part; skip it rather than fail.
    Dim notesPart As NotesSlidePart = sldPart.NotesSlidePart
    If notesPart IsNot Nothing AndAlso notesPart.NotesSlide IsNot Nothing Then
        For Each textNode As A.Text In notesPart.NotesSlide.Descendants(Of A.Text)()
            combined.Append(textNode.Text)
        Next
    End If

    Return combined.ToString()
End Function
#End Region
#Region "PDF Processing"
''' <summary>
''' Searches the text extracted from every page of a PDF file.
''' </summary>
''' <param name="strDoc">Path of the PDF file.</param>
''' <param name="target">Text to look for.</param>
''' <returns>
''' The result of catchallProcessing on the extracted text (0 or 1);
''' 0 when the file does not exist.
''' </returns>
Public Function pdfProcesssing(ByVal strDoc As String, ByVal target As String) As Integer
    ' Check before constructing the reader: the constructor is what touches the file.
    If Not File.Exists(strDoc) Then
        Return 0
    End If

    Dim oReader As New iTextSharp.text.pdf.PdfReader(strDoc)
    Try
        Dim stringOut As New StringBuilder()
        For i As Integer = 1 To oReader.NumberOfPages
            ' A fresh strategy per page, as in the original loop.
            Dim itsText As New iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy()
            stringOut.AppendLine(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(oReader, i, itsText))
        Next
        Return catchallProcessing(stringOut.ToString(), target)
    Finally
        ' Release the file even when text extraction throws.
        oReader.Close()
    End Try
End Function
#End Region
End Module