我有一个用来从 HTML 表格中提取数据的例程(抱歉,我无法注明原作者:这段代码是我找到的,但不知道它出自哪里)。它解析字符串中的 HTML 表格,并将各单元格加载到数据集(DataSet)中。
''' <summary>
''' Extracts every &lt;table&gt; element found in an HTML string and loads
''' each one into its own DataTable inside a single DataSet.
''' </summary>
''' <param name="html">HTML markup to scan for tables.</param>
''' <returns>
''' A DataSet holding one DataTable per matched table, in document order.
''' Columns are named after the table's &lt;th&gt; cells when it has any;
''' otherwise they are named "Column 1", "Column 2", ... based on the number
''' of &lt;td&gt; cells in the first row. Cell values are the raw inner markup
''' of each &lt;td&gt;.
''' </returns>
''' <exception cref="System.ArgumentNullException">html is Nothing.</exception>
''' <remarks>
''' Parsing is regex based: nested tables and unclosed tags are not handled.
''' When a table has header cells its first row is assumed to be the header
''' row and is not emitted as data. Cells beyond the number of columns are
''' dropped. Two header cells with identical text cause
''' DataColumnCollection.Add to raise DuplicateNameException.
''' </remarks>
Public Shared Function ConvertHtmlTablesToDataSet(html As String) As DataSet
    If html Is Nothing Then Throw New System.ArgumentNullException("html")

    ' Singleline lets "." span line breaks; IgnoreCase accepts <TABLE>, <Td>, ...
    Dim options As RegexOptions = RegexOptions.Singleline Or RegexOptions.IgnoreCase

    ' The \b after each tag name stops "<th" from matching "<thead>" (and
    ' "<tr" from matching e.g. "<track>").
    Dim tableExpression As String = "<table\b[^>]*>(.*?)</table>"
    Dim headerExpression As String = "<th\b[^>]*>(.*?)</th>"
    Dim rowExpression As String = "<tr\b[^>]*>(.*?)</tr>"
    Dim columnExpression As String = "<td\b[^>]*>(.*?)</td>"

    Dim ds As New DataSet()

    For Each table As Match In Regex.Matches(html, tableExpression, options)
        Dim dt As New DataTable()
        Dim rows As MatchCollection = Regex.Matches(table.Value, rowExpression, options)
        Dim headers As MatchCollection = Regex.Matches(table.Value, headerExpression, options)

        ' Decide on headers from the actual <th> matches so the test is
        ' case-insensitive and is not fooled by <thead>.
        Dim headersExist As Boolean = headers.Count > 0

        If headersExist Then
            For Each header As Match In headers
                dt.Columns.Add(header.Groups(1).Value)
            Next
        ElseIf rows.Count > 0 Then
            ' No headers: size the table from the first row's cell count.
            Dim columnCount As Integer = Regex.Matches(rows(0).Value, columnExpression, options).Count
            For iColumn As Integer = 1 To columnCount
                dt.Columns.Add("Column " + System.Convert.ToString(iColumn))
            Next
        End If

        Dim iCurrentRow As Integer = 0
        For Each row As Match In rows
            ' The first row is the header row when headers exist; skip it.
            If Not (iCurrentRow = 0 AndAlso headersExist) Then
                Dim dr As DataRow = dt.NewRow()
                Dim iCurrentColumn As Integer = 0
                For Each column As Match In Regex.Matches(row.Value, columnExpression, options)
                    ' Check bounds before writing so surplus cells (or a
                    ' table with no columns at all) cannot throw.
                    If iCurrentColumn >= dt.Columns.Count Then Exit For
                    dr(iCurrentColumn) = column.Groups(1).Value
                    iCurrentColumn += 1
                Next
                dt.Rows.Add(dr)
            End If
            iCurrentRow += 1
        Next

        ds.Tables.Add(dt)
    Next

    Return ds
End Function