我不得不使用建议的各种技术。感谢为我指明正确的方向。希望以下内容可以帮助其他人解决此类问题。
去除多余的字符
CREATE FUNCTION [dbo].[fn_StripCharacters]
(
@String NVARCHAR(MAX),
@MatchExpression VARCHAR(255)
)
RETURNS NVARCHAR(MAX)
AS
BEGIN
SET @MatchExpression = '%['+@MatchExpression+']%'
WHILE PatIndex(@MatchExpression, @String) > 0
SET @String = Stuff(@String, PatIndex(@MatchExpression, @String), 1, '')
RETURN @String
END
用法:
--remove all non-alphanumeric and non-white space
dbo.fn_StripCharacters(@Value, , '^a-z^0-9 ')
将名称拆分为多个部分
CREATE FUNCTION [dbo].[SplitTable] (@sep char(1), @sList StringList READONLY)
RETURNS @ResultList TABLE
(
[ID] VARCHAR(MAX),
[Val] VARCHAR(MAX)
)
AS
BEGIN
declare @OuterCursor cursor
declare @ID varchar(max)
declare @Val varchar(max)
set @OuterCursor = cursor fast_forward for (SELECT * FROM @sList) FOR READ ONLY
open @OuterCursor
fetch next from @OuterCursor into @ID, @Val
while (@@FETCH_STATUS=0)
begin
INSERT INTO @ResultList (ID, Val)
select @ID, split.s from dbo.Split(@sep, @Val) as split
where len(split.s) > 0
fetch next from @OuterCursor into @ID, @Val
end
close @OuterCursor
deallocate @OuterCursor
CREATE FUNCTION [dbo].[Split] (@sep char(1), @s varchar(8000))
RETURNS table
AS
RETURN (
WITH Pieces(pn, start, stop) AS (
SELECT 1, 1, CHARINDEX(@sep, @s)
UNION ALL
SELECT pn + 1, stop + 1, CHARINDEX(@sep, @s, stop + 1)
FROM Pieces
WHERE stop > 0
)
SELECT pn,
LTRIM(RTRIM(SUBSTRING(@s, start,
CASE WHEN stop > 0
THEN stop-start
ELSE 8000
END))) AS s
FROM Pieces
)
RETURN
用法:
--create split name list
DECLARE @NameList StringList
INSERT INTO @NameList (ID, Val)
SELECT id, firstname FROM dbo.[User] u
WHERE PATINDEX('%[^a-z]%', u.FirstName) > 0
----remove split dups
select u.ID, COUNT(*)
from dbo.import_SplitTable(' ', @NameList) splitList
INNER JOIN dbo.[User] u
ON splitList.id = u.id
常用昵称:
我根据这个列表创建了一个表,并用它来加入通用名称等价物。
用法:
SELECT u.id
, u.FirstName
, u_nickname_maybe.Name AS MaybeNickname
, u.LastName
, c.ID AS ContactID from
FROM dbo.[User] u
INNER JOIN nickname u_nickname_match
ON u.FirstName = u_nickname_match.Name
INNER JOIN nickname u_nickname_maybe
ON u_nickname_match.relatedid = u_nickname_maybe.id
LEFT OUTER JOIN
(
SELECT c.id, c.LastName, c.FirstName,
c_nickname_maybe.Name AS MaybeFirstName
FROM dbo.Contact c
INNER JOIN nickname c_nickname_match
ON c.FirstName = c_nickname_match.Name
INNER JOIN nickname c_nickname_maybe
ON c_nickname_match.relatedid = c_nickname_maybe.id
WHERE c_nickname_match.Name <> c_nickname_maybe.Name
) as c
ON c.AccountHolderID = ah.ID
AND u_nickname_maybe.Name = c.MaybeFirstName AND u.LastName = c.LastName
WHERE u_nickname_match.Name <> u_nickname_maybe.Name
语音算法(Jaro Winkler):
令人惊叹的文章Beyond SoundEx - 在 MS SQL Server 中进行模糊搜索的函数,展示了如何在 SQL Server 中安装和使用SimMetrics库。该库可让您找到字符串之间的相对相似性,并包含许多算法。我最终主要使用Jaro Winkler来匹配名称。
用法:
SELECT
u.id AS UserID
,c.id AS ContactID
,u.FirstName
,c.FirstName
,u.LastName
,c.LastName
,maxResult.CombinedScores
from
(
SELECT
u.ID
,
max(
dbo.JaroWinkler(lower(u.FirstName), lower(c.FirstName))
* dbo.JaroWinkler(LOWER(u.LastName), LOWER(c.LastName))
) AS CombinedScores
FROM dbo.[User] u, dbo.[Contact] c
WHERE u.ContactID IS NULL
GROUP BY u.id
) AS maxResult
INNER JOIN dbo.[User] u
ON maxResult.id = u.id
INNER JOIN dbo.[Contact] c
ON maxResult.CombinedScores =
dbo.JaroWinkler(lower(u.FirstName), lower(c.FirstName))
* dbo.JaroWinkler(LOWER(u.LastName), LOWER(c.LastName))