我最近了解到 n-gram,以及用它来比较文本中短语出现频率这一很酷的用法。现在我想写一个简单的 VB.NET 应用程序:输入一段文本,返回其中最常用的短语(即 n >= 2 的 n-gram)列表。
我找到了一个从文本生成 n-gram 的 C# 示例,于是开始把代码转换成 VB。问题是这段代码是按字符(而不是按单词)来生成 n-gram 的。我希望用作单词分隔符的是:vbCrLf(换行)、vbTab(制表符)以及以下字符:!@#$%^&*()_+-={}|\:\"'?¿ /.,<>'¡º×÷';«»[]
有谁知道该如何重写下面的函数来实现这一目的?
''' <summary>
''' Generates the distinct character-level n-grams of <paramref name="text"/>,
''' in order of first appearance.
''' </summary>
''' <param name="text">The string to split into grams.</param>
''' <param name="gramLength">The number of characters in a full-size gram.</param>
''' <returns>
''' Nothing when <paramref name="text"/> is Nothing or empty; otherwise the result of
''' Tokeniser.ArrayListToArray applied to the ordered, de-duplicated gram list.
''' </returns>
''' <remarks>
''' When the text is shorter than <paramref name="gramLength"/>, the grams are every
''' prefix of the text followed by its last character. Otherwise they are the prefixes
''' shorter than <paramref name="gramLength"/>, then every window of exactly
''' <paramref name="gramLength"/> characters, then the trailing suffixes shorter than
''' <paramref name="gramLength"/>.
''' Grams are characters, not words: no delimiter handling is performed here.
''' A negative <paramref name="gramLength"/> is not validated and will surface as an
''' exception from String.Substring.
''' </remarks>
Friend Shared Function GenerateNGrams(ByVal text As String, ByVal gramLength As Integer) As String()
    If String.IsNullOrEmpty(text) Then
        Return Nothing
    End If

    ' Ordered result list; kept as an ArrayList because that is what
    ' Tokeniser.ArrayListToArray is handed at the end.
    Dim grams As New ArrayList()
    ' Membership index so de-duplication is O(1) per gram instead of a linear
    ' ArrayList.IndexOf scan (which made the whole routine quadratic).
    ' HashSet's default string comparer is ordinal, matching IndexOf's Equals check.
    Dim seen As New System.Collections.Generic.HashSet(Of String)()
    Dim length As Integer = text.Length

    If length < gramLength Then
        ' Text is too short for a full gram: emit every prefix...
        For i As Integer = 1 To length
            Dim gram As String = text.Substring(0, i)
            If seen.Add(gram) Then grams.Add(gram)
        Next
        ' ...and the final character on its own.
        Dim lastChar As String = text.Substring(length - 1, 1)
        If seen.Add(lastChar) Then grams.Add(lastChar)
    Else
        ' Leading partial grams: prefixes of length 1 .. gramLength - 1.
        For i As Integer = 1 To gramLength - 1
            Dim gram As String = text.Substring(0, i)
            If seen.Add(gram) Then grams.Add(gram)
        Next
        ' Full-size grams: a sliding window of gramLength characters.
        For i As Integer = 0 To length - gramLength
            Dim gram As String = text.Substring(i, gramLength)
            If seen.Add(gram) Then grams.Add(gram)
        Next
        ' Trailing partial grams: suffixes shorter than gramLength.
        For i As Integer = (length - gramLength) + 1 To length - 1
            Dim gram As String = text.Substring(i, length - i)
            If seen.Add(gram) Then grams.Add(gram)
        Next
    End If

    Return Tokeniser.ArrayListToArray(grams)
End Function