我对 T-SQL 计算 Levenshtein 距离的算法感兴趣。
7 回答
我在 TSQL 中实现了标准的 Levenshtein 编辑距离函数,并进行了一些优化,从而提高了我所知道的其他版本的速度。在两个字符串的开头有共同字符(共享前缀)、结尾有共同字符(共享后缀)的情况下,并且当字符串很大并且提供了最大编辑距离时,速度的提高是显着的。例如,当输入是两个非常相似的 4000 个字符串,并且指定最大编辑距离为 2 时,这几乎比edit_distance_within
接受答案中的函数,在 0.073 秒(73 毫秒)与 55 秒内返回答案。它还具有内存效率,使用的空间等于两个输入字符串中较大的一个加上一些常量空间。它使用表示一列的单个 nvarchar“数组”,并在其中进行所有计算,以及一些辅助 int 变量。
优化:
- 跳过共享前缀和/或后缀的处理
- 如果较大的字符串以整个较小的字符串开始或结束,则提前返回
- 如果尺寸差异保证将超过最大距离,则提前返回
- 仅使用表示矩阵中的列的单个数组(实现为 nvarchar)
- 当给定最大距离时,时间复杂度从 (len1*len2) 变为 (min(len1,len2)) 即线性
- 当给定最大距离时,一旦已知无法实现最大距离界限,就尽早返回
这是代码(2014 年 1 月 20 日更新以加快速度):
-- =============================================
-- Computes and returns the Levenshtein edit distance between two strings, i.e. the
-- number of insertion, deletion, and sustitution edits required to transform one
-- string to the other, or NULL if @max is exceeded. Comparisons use the case-
-- sensitivity configured in SQL Server (case-insensitive by default).
--
-- Based on Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm,
-- with some additional optimizations.
-- =============================================
CREATE FUNCTION [dbo].[Levenshtein](
@s nvarchar(4000)
, @t nvarchar(4000)
, @max int
)
RETURNS int
WITH SCHEMABINDING
AS
BEGIN
DECLARE @distance int = 0 -- return variable
, @v0 nvarchar(4000)-- running scratchpad for storing computed distances
, @start int = 1 -- index (1 based) of first non-matching character between the two string
, @i int, @j int -- loop counters: i for s string and j for t string
, @diag int -- distance in cell diagonally above and left if we were using an m by n matrix
, @left int -- distance in cell to the left if we were using an m by n matrix
, @sChar nchar -- character at index i from s string
, @thisJ int -- temporary storage of @j to allow SELECT combining
, @jOffset int -- offset used to calculate starting value for j loop
, @jEnd int -- ending value for j loop (stopping point for processing a column)
-- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore)
, @sLen int = datalength(@s) / datalength(left(left(@s, 1) + '.', 1)) -- length of smaller string
, @tLen int = datalength(@t) / datalength(left(left(@t, 1) + '.', 1)) -- length of larger string
, @lenDiff int -- difference in length between the two strings
-- if strings of different lengths, ensure shorter string is in s. This can result in a little
-- faster speed by spending more time spinning just the inner loop during the main processing.
IF (@sLen > @tLen) BEGIN
SELECT @v0 = @s, @i = @sLen -- temporarily use v0 for swap
SELECT @s = @t, @sLen = @tLen
SELECT @t = @v0, @tLen = @i
END
SELECT @max = ISNULL(@max, @tLen)
, @lenDiff = @tLen - @sLen
IF @lenDiff > @max RETURN NULL
-- suffix common to both strings can be ignored
WHILE(@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1))
SELECT @sLen = @sLen - 1, @tLen = @tLen - 1
IF (@sLen = 0) RETURN @tLen
-- prefix common to both strings can be ignored
WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1))
SELECT @start = @start + 1
IF (@start > 1) BEGIN
SELECT @sLen = @sLen - (@start - 1)
, @tLen = @tLen - (@start - 1)
-- if all of shorter string matches prefix and/or suffix of longer string, then
-- edit distance is just the delete of additional characters present in longer string
IF (@sLen <= 0) RETURN @tLen
SELECT @s = SUBSTRING(@s, @start, @sLen)
, @t = SUBSTRING(@t, @start, @tLen)
END
-- initialize v0 array of distances
SELECT @v0 = '', @j = 1
WHILE (@j <= @tLen) BEGIN
SELECT @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END)
SELECT @j = @j + 1
END
SELECT @jOffset = @max - @lenDiff
, @i = 1
WHILE (@i <= @sLen) BEGIN
SELECT @distance = @i
, @diag = @i - 1
, @sChar = SUBSTRING(@s, @i, 1)
-- no need to look beyond window of upper left diagonal (@i) + @max cells
-- and the lower right diagonal (@i - @lenDiff) - @max cells
, @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END
, @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END
WHILE (@j <= @jEnd) BEGIN
-- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix)
SELECT @left = UNICODE(SUBSTRING(@v0, @j, 1))
, @thisJ = @j
SELECT @distance =
CASE WHEN (@sChar = SUBSTRING(@t, @j, 1)) THEN @diag --match, no change
ELSE 1 + CASE WHEN @diag < @left AND @diag < @distance THEN @diag --substitution
WHEN @left < @distance THEN @left -- insertion
ELSE @distance -- deletion
END END
SELECT @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance))
, @diag = @left
, @j = case when (@distance > @max) AND (@thisJ = @i + @lenDiff) then @jEnd + 2 else @thisJ + 1 end
END
SELECT @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END
END
RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END
END
如该函数的注释中所述,字符比较的大小写敏感性将遵循有效的排序规则。默认情况下,SQL Server 的排序规则会导致不区分大小写的比较。将此函数修改为始终区分大小写的一种方法是将特定排序规则添加到比较字符串的两个位置。但是,我还没有彻底测试这一点,尤其是当数据库使用非默认排序规则时的副作用。这些是如何更改这两行以强制区分大小写的比较:
-- prefix common to both strings can be ignored
WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1) COLLATE SQL_Latin1_General_Cp1_CS_AS)
和
SELECT @distance =
CASE WHEN (@sChar = SUBSTRING(@t, @j, 1) COLLATE SQL_Latin1_General_Cp1_CS_AS) THEN @diag --match, no change
Arnold Fribble 在 sqlteam.com/forums 上有两个提案
- 2005 年 6 月的一份和
- 2006 年 5 月的另一更新
这是 2006 年更年轻的一个:
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_NULLS ON
GO
CREATE FUNCTION edit_distance_within(@s nvarchar(4000), @t nvarchar(4000), @d int)
RETURNS int
AS
BEGIN
DECLARE @sl int, @tl int, @i int, @j int, @sc nchar, @c int, @c1 int,
@cv0 nvarchar(4000), @cv1 nvarchar(4000), @cmin int
SELECT @sl = LEN(@s), @tl = LEN(@t), @cv1 = '', @j = 1, @i = 1, @c = 0
WHILE @j <= @tl
SELECT @cv1 = @cv1 + NCHAR(@j), @j = @j + 1
WHILE @i <= @sl
BEGIN
SELECT @sc = SUBSTRING(@s, @i, 1), @c1 = @i, @c = @i, @cv0 = '', @j = 1, @cmin = 4000
WHILE @j <= @tl
BEGIN
SET @c = @c + 1
SET @c1 = @c1 - CASE WHEN @sc = SUBSTRING(@t, @j, 1) THEN 1 ELSE 0 END
IF @c > @c1 SET @c = @c1
SET @c1 = UNICODE(SUBSTRING(@cv1, @j, 1)) + 1
IF @c > @c1 SET @c = @c1
IF @c < @cmin SET @cmin = @c
SELECT @cv0 = @cv0 + NCHAR(@c), @j = @j + 1
END
IF @cmin > @d BREAK
SELECT @cv1 = @cv0, @i = @i + 1
END
RETURN CASE WHEN @cmin <= @d AND @c <= @d THEN @c ELSE -1 END
END
GO
IIRC,使用 SQL Server 2005 及更高版本,您可以使用任何 .NET 语言编写存储过程:在 SQL Server 2005 中使用 CLR 集成。这样,编写一个计算Levenstein distance的程序就不难了。
一个简单的Hello, World!从帮助中提取:
using System;
using System.Data;
using Microsoft.SqlServer.Server;
using System.Data.SqlTypes;
public class HelloWorldProc
{
[Microsoft.SqlServer.Server.SqlProcedure]
public static void HelloWorld(out string text)
{
SqlContext.Pipe.Send("Hello world!" + Environment.NewLine);
text = "Hello world!";
}
}
然后在您的 SQL Server 中运行以下命令:
CREATE ASSEMBLY helloworld from 'c:\helloworld.dll' WITH PERMISSION_SET = SAFE
CREATE PROCEDURE hello
@i nchar(25) OUTPUT
AS
EXTERNAL NAME helloworld.HelloWorldProc.HelloWorld
现在您可以测试运行它:
DECLARE @J nchar(25)
EXEC hello @J out
PRINT @J
希望这可以帮助。
您可以使用 Levenshtein 距离算法来比较字符串
在这里,您可以在http://www.kodyaz.com/articles/fuzzy-string-matching-using-levenshtein-distance-sql-server.aspx找到一个 T-SQL 示例
CREATE FUNCTION edit_distance(@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
DECLARE @s1_len int, @s2_len int
DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int
DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000)
SELECT
@s1_len = LEN(@s1),
@s2_len = LEN(@s2),
@cv1 = 0x0000,
@j = 1, @i = 1, @c = 0
WHILE @j <= @s2_len
SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1
WHILE @i <= @s1_len
BEGIN
SELECT
@s1_char = SUBSTRING(@s1, @i, 1),
@c = @i,
@cv0 = CAST(@i AS binary(2)),
@j = 1
WHILE @j <= @s2_len
BEGIN
SET @c = @c + 1
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) +
CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END
IF @c > @c_temp SET @c = @c_temp
SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1
IF @c > @c_temp SET @c = @c_temp
SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1
END
SELECT @cv1 = @cv0, @i = @i + 1
END
RETURN @c
END
(约瑟夫伽马开发的功能)
用法 :
select
dbo.edit_distance('Fuzzy String Match','fuzzy string match'),
dbo.edit_distance('fuzzy','fuzy'),
dbo.edit_distance('Fuzzy String Match','fuzy string match'),
dbo.edit_distance('levenshtein distance sql','levenshtein sql server'),
dbo.edit_distance('distance','server')
该算法仅返回 stpe 计数,通过一步替换不同字符将一个字符串更改为另一个字符串
我也在寻找 Levenshtein 算法的代码示例,很高兴在这里找到它。当然,我想了解该算法是如何工作的,并且我正在玩一些上面的示例,我正在玩一些由Veve发布的示例。为了更好地理解代码,我用矩阵创建了一个 EXCEL。
图像说超过 1000 个字。
有了这个 EXCEL,我发现有可能进行额外的性能优化。右上角红色区域的所有值都不需要计算。每个红色单元格的值导致左侧单元格的值加 1。这是因为,该区域中的第二个字符串总是比第一个字符串长,这使每个字符的距离增加了 1。
您可以通过使用语句IF @j <= @i并在此语句之前增加 @i 的值来反映这一点。
CREATE FUNCTION [dbo].[f_LevenshteinDistance](@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
DECLARE @s1_len int;
DECLARE @s2_len int;
DECLARE @i int;
DECLARE @j int;
DECLARE @s1_char nchar;
DECLARE @c int;
DECLARE @c_temp int;
DECLARE @cv0 varbinary(8000);
DECLARE @cv1 varbinary(8000);
SELECT
@s1_len = LEN(@s1),
@s2_len = LEN(@s2),
@cv1 = 0x0000 ,
@j = 1 ,
@i = 1 ,
@c = 0
WHILE @j <= @s2_len
SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1;
WHILE @i <= @s1_len
BEGIN
SELECT
@s1_char = SUBSTRING(@s1, @i, 1),
@c = @i ,
@cv0 = CAST(@i AS binary(2)),
@j = 1;
SET @i = @i + 1;
WHILE @j <= @s2_len
BEGIN
SET @c = @c + 1;
IF @j <= @i
BEGIN
SET @c_temp = CAST(SUBSTRING(@cv1, @j + @j - 1, 2) AS int) + CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END;
IF @c > @c_temp SET @c = @c_temp
SET @c_temp = CAST(SUBSTRING(@cv1, @j + @j + 1, 2) AS int) + 1;
IF @c > @c_temp SET @c = @c_temp;
END;
SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1;
END;
SET @cv1 = @cv0;
END;
RETURN @c;
END;
在 TSQL 中,比较两个项目的最好和最快的方法是 SELECT 语句,它在索引列上连接表。因此,如果您想从 RDBMS 引擎的优势中受益,我建议您采用这种方式来实现编辑距离。TSQL 循环也可以工作,但在其他语言中,Levenstein 距离计算将比在 TSQL 中进行大量比较更快。
我已经在几个系统中实现了编辑距离,使用了一系列针对仅为该目的而设计的临时表的连接。它需要一些繁重的预处理步骤 - 临时表的准备 - 但它适用于大量比较。
简而言之:预处理包括创建、填充和索引临时表。第一个包含参考 ID、一个字母列和一个 charindex 列。该表是通过运行一系列插入查询来填充的,这些查询将每个单词拆分为字母(使用 SELECT SUBSTRING)以创建与源列表中的单词有字母一样多的行(我知道,这是很多行,但 SQL Server 可以处理数十亿行数)。然后制作第二个包含 2 个字母列的表,另一个包含 3 个字母列的表,等等。最终结果是一系列表,其中包含每个单词的参考 id 和子字符串,以及它们的位置参考在这个词中。
完成此操作后,整个游戏就是复制这些表并将它们与它们的副本连接到 GROUP BY 选择查询中,该查询计算匹配数。这为每对可能的单词创建了一系列度量,然后将这些度量重新聚合为每对单词的单个 Levenstein 距离。
从技术上讲,这与 Levenstein 距离(或其变体)的大多数其他实现非常不同,因此您需要深入了解 Levenstein 距离的工作原理以及它为何如此设计。还要研究替代方案,因为使用该方法您最终会得到一系列基础指标,这些指标可以帮助同时计算编辑距离的许多变体,从而为您提供有趣的机器学习潜在改进。
本页之前的答案已经提到的另一点:尝试尽可能多地进行预处理以消除不需要距离测量的对。例如,应该排除没有一个共同字母的两个单词对,因为编辑距离可以从字符串的长度获得。或者不要测量同一个单词的两个副本之间的距离,因为它本质上是 0。或者在进行测量之前删除重复项,如果您的单词列表来自长文本,那么相同的单词可能会出现多次,因此仅测量一次距离将节省处理时间等。
我的 Azure Synapse 模组(改为使用 SET 而不是 SELECT):
-- =============================================
-- Computes and returns the Levenshtein edit distance between two strings, i.e. the
-- number of insertion, deletion, and sustitution edits required to transform one
-- string to the other, or NULL if @max is exceeded. Comparisons use the case-
-- sensitivity configured in SQL Server (case-insensitive by default).
--
-- Based on Sten Hjelmqvist's "Fast, memory efficient" algorithm, described
-- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm,
-- with some additional optimizations.
-- =============================================
CREATE FUNCTION [db0].[Levenshtein](
@s nvarchar(4000)
, @t nvarchar(4000)
, @max int
)
RETURNS int
WITH SCHEMABINDING
AS
BEGIN
DECLARE @distance int = 0 -- return variable
, @v0 nvarchar(4000)-- running scratchpad for storing computed distances
, @start int = 1 -- index (1 based) of first non-matching character between the two string
, @i int, @j int -- loop counters: i for s string and j for t string
, @diag int -- distance in cell diagonally above and left if we were using an m by n matrix
, @left int -- distance in cell to the left if we were using an m by n matrix
, @sChar nchar -- character at index i from s string
, @thisJ int -- temporary storage of @j to allow SELECT combining
, @jOffset int -- offset used to calculate starting value for j loop
, @jEnd int -- ending value for j loop (stopping point for processing a column)
-- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore)
, @sLen int = datalength(@s) / datalength(left(left(@s, 1) + '.', 1)) -- length of smaller string
, @tLen int = datalength(@t) / datalength(left(left(@t, 1) + '.', 1)) -- length of larger string
, @lenDiff int -- difference in length between the two strings
-- if strings of different lengths, ensure shorter string is in s. This can result in a little
-- faster speed by spending more time spinning just the inner loop during the main processing.
IF (@sLen > @tLen) BEGIN
SET @v0 = @s
SET @i = @sLen -- temporarily use v0 for swap
SET @s = @t
SET @sLen = @tLen
SET @t = @v0
SET @tLen = @i
END
SET @max = ISNULL(@max, @tLen)
SET @lenDiff = @tLen - @sLen
IF @lenDiff > @max RETURN NULL
-- suffix common to both strings can be ignored
WHILE(@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1))
SET @sLen = @sLen - 1
SET @tLen = @tLen - 1
IF (@sLen = 0) RETURN @tLen
-- prefix common to both strings can be ignored
WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1))
SET @start = @start + 1
IF (@start > 1) BEGIN
SET @sLen = @sLen - (@start - 1)
SET @tLen = @tLen - (@start - 1)
-- if all of shorter string matches prefix and/or suffix of longer string, then
-- edit distance is just the delete of additional characters present in longer string
IF (@sLen <= 0) RETURN @tLen
SET @s = SUBSTRING(@s, @start, @sLen)
SET @t = SUBSTRING(@t, @start, @tLen)
END
-- initialize v0 array of distances
SET @v0 = ''
SET @j = 1
WHILE (@j <= @tLen) BEGIN
SET @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END)
SET @j = @j + 1
END
SET @jOffset = @max - @lenDiff
SET @i = 1
WHILE (@i <= @sLen) BEGIN
SET @distance = @i
SET @diag = @i - 1
SET @sChar = SUBSTRING(@s, @i, 1)
-- no need to look beyond window of upper left diagonal (@i) + @max cells
-- and the lower right diagonal (@i - @lenDiff) - @max cells
SET @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END
SET @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END
WHILE (@j <= @jEnd) BEGIN
-- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix)
SET @left = UNICODE(SUBSTRING(@v0, @j, 1))
SET @thisJ = @j
SET @distance =
CASE WHEN (@sChar = SUBSTRING(@t, @j, 1)) THEN @diag --match, no change
ELSE 1 + CASE WHEN @diag < @left AND @diag < @distance THEN @diag --substitution
WHEN @left < @distance THEN @left -- insertion
ELSE @distance -- deletion
END
END
SET @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance))
SET @diag = @left
SET @j = case when (@distance > @max) AND (@thisJ = @i + @lenDiff)
then @jEnd + 2
else @thisJ + 1 end
END
SET @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END
END
RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END
END