0

是否可以在 SQL Server 中比较类似数据的行?我在表中有一个公司名称列,其中公司名称可能有些相似。以下是代表相同 4 家公司的不同 8 个值的示例:

ANDORRA WOODS   
ANDORRA WOODS HEALTHCARE CENTER
ABC HEALTHCARE, JOB #31181
ABC HEALTHCARE, JOB #31251
ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT
ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY
APEX SYSTEMS
APEX SYSTEMS, INC

我现在清理它的方法是使用谷歌优化,在那里我可以识别相似数据值的集群并将它们全部整合在一起。使用此示例,我只需要 4 个名称而不是 8 个名称,因此我需要用一个替换类似的名称,因为稍后我将为这些名称分配索引。任何帮助是极大的赞赏。

4

2 回答 2

1

我有几个 UDF 是我前段时间从一些 VB 代码转换而来的,它接受 2 个 varchar() 并返回一个介于 0 和 100 之间的 int(0= 不相似,100= 相同),如果您感兴趣的话。

-- Description: Removes any special characters from a string
CREATE FUNCTION [dbo].[SimReplaceSpecial] 
(
    -- Add the parameters for the function here
    @String varchar(max)
)
RETURNS varchar(max)
AS
BEGIN
    -- Declare the return variable here
    DECLARE @Result varchar(max) = ''

    -- Add the T-SQL statements to compute the return value here
    DECLARE @Pos int = 1
    DECLARE @Asc int
    DECLARE @WorkingString varchar(max)

    SET @WorkingString = upper(@String)

    WHILE @Pos <= LEN(@WorkingString)
    BEGIN
        SET @Asc = ascii(substring(@WorkingString,@Pos,1))
        If (@Asc >= 48 And @Asc <= 57) Or (@Asc >= 65 And @Asc <= 90)
                SET @Result = @Result + Char(@Asc)

        SET @Pos = @Pos + 1
        --IF @Pos + 1 > len(@String)
        --  BREAK
        --ELSE
        --  CONTINUE
    END

    -- Return the result of the function
    RETURN @Result
END

-- Description: DO NOT CALL DIRECTLY - Used by the Similar function
--   Finds longest common substring (other than single
--   characters) in String1 and String2, then recursively
--   finds longest common substring in left-hand
--   portion and right-hand portion. Updates the
--   cumulative score.
CREATE FUNCTION [dbo].[SimFindCommon] 
(
    -- Add the parameters for the function here
    @String1 varchar(max),
    @String2 varchar(max),
    @Score int
)
RETURNS int
AS
BEGIN
    -- Declare the return variable here
    --DECLARE @Result int

    DECLARE @Longest Int = 0
    DECLARE @StartPos1 Int = 0
    DECLARE @StartPos2 Int = 0
    DECLARE @J Int = 0

    DECLARE @HoldStr varchar(max)
    DECLARE @TestStr varchar(max)
    DECLARE @LeftStr1 varchar(max) = ''
    DECLARE @LeftStr2 varchar(max) = ''
    DECLARE @RightStr1 varchar(max) = ''
    DECLARE @RightStr2 varchar(max) = ''

    -- Add the T-SQL statements to compute the return value here
    SET @HoldStr = @String2
    WHILE LEN(@HoldStr) > @Longest 
    BEGIN
        SET @TestStr = @HoldStr
        WHILE LEN(@TestStr) > 1
        BEGIN
                SET @J = CHARINDEX(@TestStr,@String1)
            If @J > 0
            BEGIN
                --Test string is sub-set of the other string
                If Len(@TestStr) > @Longest 
                BEGIN
                    --Test string is longer than previous
                    --longest. Store its length and position.
                    SET @Longest = Len(@TestStr)
                    SET @StartPos1 = @J
                    SET @StartPos2 = CHARINDEX(@TestStr,@String2)
                END
                --No point in going further with this string
                BREAK
            END
            ELSE
                --Test string is not a sub-set of the other
                --string. Discard final character of test
                --string and try again.
                SET @TestStr = Left(@TestStr, LEN(@TestStr) - 1)
        END 
        --Now discard first char of test string and
        --repeat the process.
        SET @HoldStr = Right(@HoldStr, LEN(@HoldStr) - 1)
    END 

    --Update the cumulative score with the length of
    --the common sub-string.
    SET @Score = @Score + @Longest

    --We now have the longest common sub-string, so we
    --can isolate the sub-strings to the left and right
    --of it.

    If @StartPos1 > 3 And @StartPos2 > 3 
    BEGIN
        SET @LeftStr1 = Left(@String1, @StartPos1 - 1)
        SET @LeftStr2 = Left(@String2, @StartPos2 - 1)
        If RTRIM(LTRIM(@LeftStr1)) <> '' And RTRIM(LTRIM(@LeftStr2)) <> ''
        BEGIN
            --Get longest common substring from left strings
            SET @Score = dbo.SimFindCommon(@LeftStr1, @LeftStr2,@Score)
        END
    END
    ELSE
    BEGIN
        SET @LeftStr1 = ''
        SET @LeftStr2 = ''
    END
    If @Longest > 0
    BEGIN
        SET @RightStr1 = substring(@String1, @StartPos1 + @Longest, LEN(@String1))
        SET @RightStr2 = substring(@String2, @StartPos2 + @Longest, LEN(@String2))

        If RTRIM(LTRIM(@RightStr1)) <> '' And RTRIM(LTRIM(@RightStr2)) <> ''
        BEGIN
            --Get longest common substring from right strings
            SET @Score = dbo.SimFindCommon(@RightStr1, @RightStr2,@Score)
        END
    END
    ELSE
    BEGIN
        SET @RightStr1 = ''
        SET @RightStr2 = ''
    END

    -- Return the result of the function
    RETURN @Score
END

-- Description: Compares two not-empty strings regardless of case.
--  Returns a numeric indication of their similarity 
--  (0 = not at all similar, 100 = identical)
CREATE FUNCTION [dbo].[Similar] 
(
    -- Add the parameters for the function here
    @String1 varchar(max),
    @String2 varchar(max)
)
RETURNS int
AS
BEGIN
    -- Declare the return variable here
    DECLARE @Result int
    DECLARE @WorkingString1 varchar(max)
    DECLARE @WorkingString2 varchar(max)

    -- Add the T-SQL statements to compute the return value here
    if isnull(@String1,'') = '' or isnull(@String2,'') = ''
        SET @Result = 0
    ELSE 
    BEGIN
        --Convert each string to simplest form (letters
        --and digits only, all upper case)
        SET @WorkingString1 = dbo.SimReplaceSpecial(@String1)
        SET @WorkingString2 = dbo.SimReplaceSpecial(@String2)

        If RTRIM(LTRIM(@WorkingString1)) = '' Or RTRIM(LTRIM(@WorkingString2)) = ''
        BEGIN
            --One or both of the strings is now empty
            SET @Result =  0
        END
        ELSE
        BEGIN
            If @WorkingString1 = @WorkingString2
            BEGIN
                --Strings are identical
                SET @Result =  100
            END
            ELSE
            BEGIN
                --Find all common sub-strings
                SET @Result = dbo.SimFindCommon(@WorkingString1, @WorkingString2,0)

                --We now have the cumulative score. Return this
                --as a percent of the maximum score. The maximum
                --score is the average length of the two strings.
                SET @Result = @Result * 200 / (Len(@WorkingString1) + Len(@WorkingString2))
            END
        END
    END

    -- Return the result of the function
    RETURN @Result

END



--Usage--------------------------------------------------------------------
--Call the "Similar" Function only
SELECT dbo.Similar('ANDORRA WOODS','ANDORRA WOODS HEALTHCARE CENTER')
--Result = 60
SELECT dbo.Similar('ABC HEALTHCARE, JOB #31181','ABC HEALTHCARE, JOB #31251')
--Result = 85
SELECT dbo.Similar('ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT','ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY')
--Result = 92
SELECT dbo.Similar('APEX SYSTEMS','APEX SYSTEMS, INC')
--Result = 88
于 2013-09-16T20:53:48.510 回答
0

SSIS/数据工具有一个模糊分组转换,在这种情况下非常有用。它实际上并没有对您的数据进行分组,而是为您提供相似性分数,您可以使用这些分数来确定何时应将项目分组在一起。

那里有很多教程,这里有一个: 模糊分组转换

于 2013-09-16T20:44:32.857 回答