我有一组数据,需要从中选出最佳匹配。每条记录都有一个 Name 和一个 CNum。Name 相同的所有记录本应具有相同的 CNum;但实际上,有些 Name 组内的 CNum 一致,有些则不一致(这正是要解决的问题)。我需要确定哪个 CNum 更好,并把同一 Name 组内的所有记录统一更新为这一个 CNum。
我已使用 ParentId 列更新了表格以显示匹配的名称,并使用 SubParentId 标记匹配的“Name”和“CNum”以帮助提取所需的结果(并使其更容易查看匹配项)。
为了帮助确定“名称”组中哪个 CNum 比另一个更好,每条记录都在两列中评分:“ScoreA”和“ScoreB”;分数越低越好。以下是我用来确定哪个 CNum 最好的规则:
- 如果名称组中的所有记录(相同的 ParentId)具有相同的 CNum(相同的 SubParentId),则不执行任何操作
- 如果同一 Name 组(相同的 ParentId)中的 CNum 不一致,则选出 ScoreA 唯一最低的那条记录,并将该组所有记录的 ParentId 更新为这条记录的 Id
- 如果最低的 ScoreA 不唯一(存在并列),则改用 ScoreB 唯一最低的那条记录的 Id 来更新该组每条记录的 ParentId
- 如果最低的 ScoreB 也不唯一,即不同区域之间打成平局,并且其中只有一个的区域是“AB”,则用区域为“AB”的那条记录的 Id 更新该组每条记录的 ParentId
- 如果仍然无法选出最佳匹配,或者存在多个“AB”区域(CNum 不同且分数并列),则将该 Name 组中每条记录的 NoMatch 列设为 1
假设:Name 和 CNum 都相同的记录,其分数也相同
有没有一种好方法可以应用上述规则来获得我正在寻找的结果?
下面是示例数据以及我期望的结果,预期的获胜记录已在对应的 INSERT 语句旁注明:
-- create table
-- Results holds one row per candidate record. Rows that share a Name form a
-- group; rows that share both Name and CNum form a sub-group inside it.
CREATE TABLE Results
(
    Id          INT           IDENTITY(1, 1) NOT NULL PRIMARY KEY, -- auto-numbered row id
    Name        VARCHAR(200)  NULL,                  -- group key
    CNum        NVARCHAR(100) NULL,                  -- value that should be identical within a Name group
    Region      NVARCHAR(3)   NULL,                  -- region code, e.g. 'AB' (used as a tie breaker)
    ScoreA      INT           NULL,                  -- first ranking score; lower is better
    ScoreB      INT           NULL,                  -- second ranking score; lower is better
    ParentId    INT           NULL,                  -- filled in later: shared by rows with the same Name
    SubParentId INT           NULL,                  -- filled in later: shared by rows with the same Name and CNum
    NoMatch     BIT           NOT NULL DEFAULT (0)   -- 0 by default; presumably the "no best match" flag -- confirm
)
GO
-- insert data
-- ScoreA and ScoreB are INT columns, so the scores are written as numeric
-- literals instead of quoted strings (no implicit varchar -> int conversion).
-- Rows are inserted one statement at a time, in the order listed, so the
-- IDENTITY values follow the order of this script.

-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );

-- Acme Co: winner noted below --> best ScoreA
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '269415003', 'AB', -13460, -23 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '856545', 'AB', -22, 16 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Acme Co', '856545', 'AB', -22, 16 );

-- Zuland Ltd: winner noted below --> best ScoreB
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -30 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -23 );

-- Emco Inc: winner noted below --> AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5695003', 'ON', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5695003', 'AB', -668, 13 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 );

-- Zemco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Zemco Inc', '995588', 'WY', -668, 13 );

-- Texco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '234JJJ', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '555552', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '234JJJ', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Texco Inc', '555552', 'WY', -668, 13 );

-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );

-- Mike Inc: No Match --> more than 1 'AB' with tied scores
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '555552', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '555552222', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB )
VALUES ( 'Mike Inc', '90210', 'KT', -668, 13 );
GO
-- set parent id matched on Name
-- Gives every row in a Name group the same ParentId: the lowest Id in that group.
-- Joining Results straight to itself on Name pairs each row with every row of
-- its group, and UPDATE ... FROM then keeps an arbitrary one of those matches,
-- so rows of the same group are not guaranteed to end up with the same value.
-- The join target is therefore collapsed to exactly one row per Name first.
-- Rows whose Name is NULL match nothing and fall back to their own Id.
-- No WHERE clause on purpose: every row in Results is assigned a ParentId.
UPDATE r
SET r.ParentId = COALESCE( g.GroupId, r.Id )
FROM Results AS r
LEFT JOIN
(
    SELECT Name, MIN( Id ) AS GroupId
    FROM Results
    GROUP BY Name
) AS g
    ON r.Name = g.Name
GO
-- set sub-parent id matched on Name and CNum
-- Gives every row in a (Name, CNum) sub-group the same SubParentId: the lowest
-- Id in that sub-group.
-- Joining Results straight to itself pairs each row with every row of its
-- sub-group, and UPDATE ... FROM then keeps an arbitrary one of those matches,
-- so the join target is collapsed to exactly one row per (Name, CNum) first.
-- Rows whose Name or CNum is NULL match nothing and fall back to their own Id.
-- No WHERE clause on purpose: every row in Results is assigned a SubParentId.
UPDATE r
SET r.SubParentId = COALESCE( g.SubGroupId, r.Id )
FROM Results AS r
LEFT JOIN
(
    SELECT Name, CNum, MIN( Id ) AS SubGroupId
    FROM Results
    GROUP BY Name, CNum
) AS g
    ON r.Name = g.Name AND
       r.CNum = g.CNum
GO