我有带有数据的 excel 表,我想获得它们之间的 Levenshtein 距离。我已经尝试导出为文本,从脚本(php)中读取,运行 Levenshtein(计算 Levenshtein 距离),再次将其保存到 excel 中。
但我正在寻找一种在 VBA 中以编程方式计算 Levenshtein 距离的方法。我该怎么做呢?
我有带有数据的 excel 表,我想获得它们之间的 Levenshtein 距离。我已经尝试导出为文本,从脚本(php)中读取,运行 Levenshtein(计算 Levenshtein 距离),再次将其保存到 excel 中。
但我正在寻找一种在 VBA 中以编程方式计算 Levenshtein 距离的方法。我该怎么做呢?
翻译自维基百科:
Option Explicit
Public Function Levenshtein(s1 As String, s2 As String)
Dim i As Integer
Dim j As Integer
Dim l1 As Integer
Dim l2 As Integer
Dim d() As Integer
Dim min1 As Integer
Dim min2 As Integer
l1 = Len(s1)
l2 = Len(s2)
ReDim d(l1, l2)
For i = 0 To l1
d(i, 0) = i
Next
For j = 0 To l2
d(0, j) = j
Next
For i = 1 To l1
For j = 1 To l2
If Mid(s1, i, 1) = Mid(s2, j, 1) Then
d(i, j) = d(i - 1, j - 1)
Else
min1 = d(i - 1, j) + 1
min2 = d(i, j - 1) + 1
If min2 < min1 Then
min1 = min2
End If
min2 = d(i - 1, j - 1) + 1
If min2 < min1 Then
min1 = min2
End If
d(i, j) = min1
End If
Next
Next
Levenshtein = d(l1, l2)
End Function
?Levenshtein("星期六","星期日")
3
感谢 smirkingman 的精彩代码帖子。这是一个优化的版本。
1)使用 Asc(Mid$(s1, i, 1) 代替。数值比较通常比文本快。
2) 使用 Mid$ 而不是 Mid,因为后者是变体版本。并添加 $ 是字符串版本。
3) 使用最少的应用程序功能。(仅个人喜好)
4) 使用 Long 而不是 Integers,因为它是 excel 原生使用的。
Function Levenshtein(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long
string1_length = Len(string1)
string2_length = Len(string2)
ReDim distance(string1_length, string2_length)
For i = 0 To string1_length
distance(i, 0) = i
Next
For j = 0 To string2_length
distance(0, j) = j
Next
For i = 1 To string1_length
For j = 1 To string2_length
If Asc(Mid$(string1, i, 1)) = Asc(Mid$(string2, j, 1)) Then
distance(i, j) = distance(i - 1, j - 1)
Else
distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
End If
Next
Next
Levenshtein = distance(string1_length, string2_length)
End Function
更新:
对于那些想要它的人:我认为可以肯定地说大多数人使用 Levenshtein 距离来计算模糊匹配百分比。这是一种方法,我添加了一个优化,您可以指定最小值。匹配 % 以返回(默认为 70%+。输入“50”或“80”等百分比或“0”以运行公式)。
速度提升来自这样一个事实,即该函数将通过检查 2 个字符串的长度来检查它是否在您给出的百分比范围内。请注意,有一些地方可以优化此功能,但为了便于阅读,我将其保留在此处。我在结果中连接了距离以证明功能,但你可以摆弄它:)
Function FuzzyMatch(ByVal string1 As String, _
ByVal string2 As String, _
Optional min_percentage As Long = 70) As String
Dim i As Long, j As Long
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long, result As Long
string1_length = Len(string1)
string2_length = Len(string2)
' Check if not too long
If string1_length >= string2_length * (min_percentage / 100) Then
' Check if not too short
If string1_length <= string2_length * ((200 - min_percentage) / 100) Then
ReDim distance(string1_length, string2_length)
For i = 0 To string1_length: distance(i, 0) = i: Next
For j = 0 To string2_length: distance(0, j) = j: Next
For i = 1 To string1_length
For j = 1 To string2_length
If Asc(Mid$(string1, i, 1)) = Asc(Mid$(string2, j, 1)) Then
distance(i, j) = distance(i - 1, j - 1)
Else
distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
End If
Next
Next
result = distance(string1_length, string2_length) 'The distance
End If
End If
If result <> 0 Then
FuzzyMatch = (CLng((100 - ((result / string1_length) * 100)))) & _
"% (" & result & ")" 'Convert to percentage
Else
FuzzyMatch = "Not a match"
End If
End Function
使用字节数组获得 17 倍的速度增益
Option Explicit
Public Declare Function GetTickCount Lib "kernel32" () As Long
Sub test()
Dim s1 As String, s2 As String, lTime As Long, i As Long
s1 = Space(100)
s2 = String(100, "a")
lTime = GetTickCount
For i = 1 To 100
LevenshteinStrings s1, s2 ' the original fn from Wikibooks and Stackoverflow
Next
Debug.Print GetTickCount - lTime; " ms" ' 3900 ms for all diff
lTime = GetTickCount
For i = 1 To 100
Levenshtein s1, s2
Next
Debug.Print GetTickCount - lTime; " ms" ' 234 ms
End Sub
'Option Base 0 assumed
'POB: fn with byte array is 17 times faster
Function Levenshtein(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long, bs1() As Byte, bs2() As Byte
Dim string1_length As Long
Dim string2_length As Long
Dim distance() As Long
Dim min1 As Long, min2 As Long, min3 As Long
string1_length = Len(string1)
string2_length = Len(string2)
ReDim distance(string1_length, string2_length)
bs1 = string1
bs2 = string2
For i = 0 To string1_length
distance(i, 0) = i
Next
For j = 0 To string2_length
distance(0, j) = j
Next
For i = 1 To string1_length
For j = 1 To string2_length
'slow way: If Mid$(string1, i, 1) = Mid$(string2, j, 1) Then
If bs1((i - 1) * 2) = bs2((j - 1) * 2) Then ' *2 because Unicode every 2nd byte is 0
distance(i, j) = distance(i - 1, j - 1)
Else
'distance(i, j) = Application.WorksheetFunction.Min _
(distance(i - 1, j) + 1, _
distance(i, j - 1) + 1, _
distance(i - 1, j - 1) + 1)
' spell it out, 50 times faster than worksheetfunction.min
min1 = distance(i - 1, j) + 1
min2 = distance(i, j - 1) + 1
min3 = distance(i - 1, j - 1) + 1
If min1 <= min2 And min1 <= min3 Then
distance(i, j) = min1
ElseIf min2 <= min1 And min2 <= min3 Then
distance(i, j) = min2
Else
distance(i, j) = min3
End If
End If
Next
Next
Levenshtein = distance(string1_length, string2_length)
End Function
我认为它变得更快......除了改进以前的代码以提高速度和结果之外没有做太多
' Levenshtein3 tweaked for UTLIMATE speed and CORRECT results
' Solution based on Longs
' Intermediate arrays holding Asc()make difference
' even Fixed length Arrays have impact on speed (small indeed)
' Levenshtein version 3 will return correct percentage
'
Function Levenshtein3(ByVal string1 As String, ByVal string2 As String) As Long
Dim i As Long, j As Long, string1_length As Long, string2_length As Long
Dim distance(0 To 60, 0 To 50) As Long, smStr1(1 To 60) As Long, smStr2(1 To 50) As Long
Dim min1 As Long, min2 As Long, min3 As Long, minmin As Long, MaxL As Long
string1_length = Len(string1): string2_length = Len(string2)
distance(0, 0) = 0
For i = 1 To string1_length: distance(i, 0) = i: smStr1(i) = Asc(LCase(Mid$(string1, i, 1))): Next
For j = 1 To string2_length: distance(0, j) = j: smStr2(j) = Asc(LCase(Mid$(string2, j, 1))): Next
For i = 1 To string1_length
For j = 1 To string2_length
If smStr1(i) = smStr2(j) Then
distance(i, j) = distance(i - 1, j - 1)
Else
min1 = distance(i - 1, j) + 1
min2 = distance(i, j - 1) + 1
min3 = distance(i - 1, j - 1) + 1
If min2 < min1 Then
If min2 < min3 Then minmin = min2 Else minmin = min3
Else
If min1 < min3 Then minmin = min1 Else minmin = min3
End If
distance(i, j) = minmin
End If
Next
Next
' Levenshtein3 will properly return a percent match (100%=exact) based on similarities and Lengths etc...
MaxL = string1_length: If string2_length > MaxL Then MaxL = string2_length
Levenshtein3 = 100 - CLng((distance(string1_length, string2_length) * 100) / MaxL)
End Function