0

我有一组数据,需要从中选出最佳匹配。每条记录都有一个 Name 和一个 CNum。Name 相同的所有记录本应具有相同的 CNum;但实际上,有些 Name 组内的 CNum 一致,有些则不一致(这正是要解决的问题)。我需要确定哪个 CNum 更好,并把同一 Name 组内的所有记录更新为同一个 CNum。

我已使用 ParentId 列更新了表格以显示匹配的名称,并使用 SubParentId 标记匹配的“Name”和“CNum”以帮助提取所需的结果(并使其更容易查看匹配项)。

为了帮助确定“名称”组中哪个 CNum 比另一个更好,每条记录都在两列中评分:“ScoreA”和“ScoreB”;分数越低越好。以下是我用来确定哪个 CNum 最好的规则:

  1. 如果名称组中的所有记录(相同的 ParentId)具有相同的 CNum(相同的 SubParentId),则不执行任何操作
  2. 如果它们在同一个 Name 组中的 CNum 不同,则选择具有单个最低 ScoreA 的记录的 id 并将组的 parentId 更新为选定的 id
  3. 如果没有单个 ScoreA 匹配,则使用具有单个最低 ScoreB 的记录的 id 更新每个组的 parentId
  4. 如果没有单个 ScoreB 匹配,并且它是不同区域之间的平局,并且其中只有一个具有“AB”区域,则使用区域为“AB”的记录的 id 更新组的每个 parentId
  5. 如果仍然无法确定唯一的匹配,或者存在多个区域为“AB”的记录(CNum 不同且分数并列),则将该 Name 组中每条记录的 NoMatch 列设置为 1

假设:如果多条记录具有相同的 Name 和 CNum,那么它们的分数也相同。

有没有一种好方法可以应用上述规则来获得我正在寻找的结果?

这是我正在寻找的数据样本和结果,其中插入语句旁边注明了预期的获胜结果:

-- Sample table for the "best CNum per Name" problem.
CREATE TABLE Results
(
    -- Surrogate key, auto-numbered from 1 in steps of 1.
    Id          INT IDENTITY(1, 1) NOT NULL PRIMARY KEY,
    -- Rows sharing a Name form one group that should end up with a single CNum.
    Name        VARCHAR(200)  NULL,
    CNum        NVARCHAR(100) NULL,
    Region      NVARCHAR(3)   NULL,
    -- Scores used to rank competing CNum values; lower is better.
    ScoreA      INT           NULL,
    ScoreB      INT           NULL,
    -- Filled in by the later UPDATE statements: group marker per Name ...
    ParentId    INT           NULL,
    -- ... and per (Name, CNum).
    SubParentId INT           NULL,
    -- Defaults to 0; presumably the "no best match" flag of rule 5 -- confirm.
    NoMatch     BIT           NOT NULL DEFAULT 0
);
GO

-- insert data
--
-- ScoreA / ScoreB are INT columns, so the values are written as numeric
-- literals instead of quoted strings (no implicit varchar -> int conversion).
-- Rows are inserted one statement at a time so IDENTITY values follow the
-- order shown here.
--
-- NOTE(review): the stated assumption "same Name + CNum => same scores" does
-- not hold for this sample ('Acme Co'/'269415003' and 'Zuland Ltd'/'654543',
-- '5603' carry differing scores); the expected winners below rely on that.

-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );

-- Acme Co: winner noted below --> best ScoreA
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '269415003', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '269415003', 'AB', -13460, -23 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '5695003', 'AB', -155, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '856545', 'AB', -22, 16 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Acme Co', '856545', 'AB', -22, 16 );

-- Zuland Ltd: winner noted below --> best ScoreB
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -28 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zuland Ltd', '654543', 'AB', -13455, -23 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -30 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zuland Ltd', '5603', 'ON', -13455, -23 );

-- Emco Inc: winner noted below --> AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '5695003', 'ON', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '5695003', 'AB', -668, 13 ); -- Expected Winner
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Emco Inc', '995588', 'WY', -668, 13 );

-- Zemco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zemco Inc', '5695003', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zemco Inc', '5545', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Zemco Inc', '995588', 'WY', -668, 13 );

-- Texco Inc: No Winner --> No AB tie breaker
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Texco Inc', '234JJJ', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Texco Inc', '555552', 'TN', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Texco Inc', '234JJJ', 'CA', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Texco Inc', '555552', 'WY', -668, 13 );

-- Grasslands: Leave as is --> they are all the same
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Grasslands', '91588', 'WY', -668, 13 );

-- Mike Inc: No Match --> more than 1 'AB' with tied scores
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Mike Inc', '555552', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Mike Inc', '234JJJ', 'AB', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Mike Inc', '555552222', 'WY', -668, 13 );
INSERT INTO Results ( Name, CNum, Region, ScoreA, ScoreB ) VALUES ( 'Mike Inc', '90210', 'KT', -668, 13 );

GO

-- set parent id matched on Name
--
-- Every row of a Name group receives the smallest Id found in that group.
-- The join is on the primary key, so each target row matches exactly one
-- source row; a plain self-join on Name matches several rows per target and
-- makes the result of UPDATE ... FROM undefined.
-- Rows whose Name is NULL keep their own Id (NULL never equals NULL, so such
-- rows do not belong to any group).
UPDATE  r
SET     r.ParentId = g.GroupParentId
FROM    Results AS r
INNER JOIN (
            SELECT  Id,
                    CASE
                        WHEN Name IS NULL THEN Id
                        ELSE MIN( Id ) OVER ( PARTITION BY Name )
                    END AS GroupParentId
            FROM    Results
        ) AS g
  ON    g.Id = r.Id;
GO

-- set sub-parent id matched on Name and CNum
--
-- Every row of a (Name, CNum) group receives the smallest Id found in that
-- group. The join is on the primary key, so each target row matches exactly
-- one source row; a plain self-join on Name and CNum matches several rows per
-- target and makes the result of UPDATE ... FROM undefined.
-- Rows with a NULL Name or NULL CNum keep their own Id (NULL never equals
-- NULL, so such rows do not belong to any group).
UPDATE  r
SET     r.SubParentId = g.GroupSubParentId
FROM    Results AS r
INNER JOIN (
            SELECT  Id,
                    CASE
                        WHEN Name IS NULL OR CNum IS NULL THEN Id
                        ELSE MIN( Id ) OVER ( PARTITION BY Name, CNum )
                    END AS GroupSubParentId
            FROM    Results
        ) AS g
  ON    g.Id = r.Id;
GO
4

1 回答 1

1

根据你给出的规则,我想出了下面的方案。将来唯一可能失效的地方是区域规则(规则 4):如果首选区域不再是“AB”,这段代码就不适用了。由于“AB”在示例数据的区域中按字母顺序排在最前面,所以我可以使用下面的代码:

-- Resolve every Name group that contains more than one CNum, following the
-- rules from the question:
--   1. one distinct CNum in the group            -> group is left untouched
--   2. lowest ScoreA belongs to a single CNum    -> that record wins
--   3. otherwise lowest ScoreB (among the rows tied on lowest ScoreA)
--      belongs to a single CNum                  -> that record wins
--   4. otherwise exactly one CNum among the tied rows has Region 'AB'
--                                                -> that 'AB' record wins
--   5. otherwise                                 -> NoMatch = 1 for the group
-- For a resolved group every row gets the winner's CNum and ParentId is set
-- to the winner's Id. Unresolved groups keep CNum / ParentId and are flagged.
-- "Single" is counted per distinct CNum, so duplicate rows of the same CNum
-- do not count as a tie.
-- NOTE: ascending order places NULL scores first in SQL Server, so a NULL
-- score is treated as the best score; NULL CNum values are ignored when
-- counting candidates.
WITH Ranked AS (
    -- ScoreRank = 1 marks the rows holding the lowest ScoreA and, among
    -- those, the lowest ScoreB. RANK keeps every tied row at rank 1.
    SELECT  r.[Id],
            r.[Name],
            r.[CNum],
            r.[Region],
            RANK() OVER ( PARTITION BY r.[Name]
                          ORDER BY r.[ScoreA] ASC, r.[ScoreB] ASC ) AS ScoreRank
    FROM    [dbo].[Results] AS r
),
GroupCNums AS (
    -- Rule 1: number of different CNum values in the whole Name group.
    SELECT  [Name],
            COUNT( DISTINCT [CNum] ) AS CNumCount
    FROM    [dbo].[Results]
    GROUP BY [Name]
),
Best AS (
    -- Candidates after rules 2 and 3, plus the 'AB' subset used by rule 4.
    SELECT  [Name],
            COUNT( DISTINCT [CNum] ) AS BestCNumCount,
            MIN( CASE WHEN [CNum] IS NOT NULL THEN [Id] END ) AS BestId,
            COUNT( DISTINCT CASE WHEN [Region] = N'AB' THEN [CNum] END ) AS AbCNumCount,
            MIN( CASE WHEN [Region] = N'AB' AND [CNum] IS NOT NULL THEN [Id] END ) AS AbId
    FROM    Ranked
    WHERE   ScoreRank = 1
    GROUP BY [Name]
),
Decision AS (
    -- WinnerId stays NULL when neither the scores nor the 'AB' rule single
    -- out one CNum (rule 5).
    SELECT  b.[Name],
            CASE
                WHEN b.BestCNumCount = 1 THEN b.BestId
                WHEN b.AbCNumCount = 1 THEN b.AbId
                ELSE NULL
            END AS WinnerId
    FROM    Best AS b
    INNER JOIN GroupCNums AS g ON g.[Name] = b.[Name]
    WHERE   g.CNumCount > 1
)
UPDATE  r2
SET     [CNum]     = CASE WHEN d.WinnerId IS NULL THEN r2.[CNum] ELSE w.[CNum] END,
        [ParentId] = CASE WHEN d.WinnerId IS NULL THEN r2.[ParentId] ELSE d.WinnerId END,
        [NoMatch]  = CASE WHEN d.WinnerId IS NULL THEN 1 ELSE 0 END
FROM    [dbo].[Results] AS r2
    INNER JOIN Decision AS d ON d.[Name] = r2.[Name]
    -- Joined on the primary key, so each target row sees at most one winner.
    LEFT JOIN [dbo].[Results] AS w ON w.[Id] = d.WinnerId;
于 2013-09-20T22:58:33.790 回答