sql - 删除重复最简单的方法/解释

Question

首先，我想说的是，我（作为新手）确实搜索过几个关于表中重复项的问答，但不幸的是，我没能把那些答案里的代码改造成适用于我自己情况的形式。

我的表是由在 SQL Server 2008 中排序的报表组成的。

我想知道如何删除重复记录并附上解释。

"MyTable":

Column1   (PK-auto incremental table's record ID) 
Column2   (some TXT) 
Column3   (Some TXT)
Column4   (SmallDateTime)
Column5   is empty

Column5 将保存的值为：该组重复记录的总数（被删除的重复行数，加上保留下来的这一行）。

解决方案的关键在于：如果有多条记录的 [column2 和 column3] 内容相同，就视为重复；但这些重复记录的日期（column4）并不一定相同。

从这样：

col1  col2   col3  col4         col5
----  -----  ----  -----------  ----
1     [abc]  [4]   [10/1/2012]  null
2     [abc]  [1]   [12/1/2012]  null
3     [ghi]  [6]   [4/1/2012]   null
4     [def]  [5]   [8/1/2012]   null
5     [abc]  [4]   [10/1/2012]  null
6     [def]  [5]   [12/1/2012]  null
7     [ghi]  [6]   [15/1/2012]  null
8     [abc]  [4]   [17/1/2012]  null
9     [ghi]  [6]   [6/1/2012]   null
10    [abc]  [1]   [13/1/2012]  null

变成这样：

col1  col2   col3  col4         col5
----  -----  ----  -----------  ----
8     [abc]  [4]   [17/1/2012]  3
10    [abc]  [1]   [13/1/2012]  2
6     [def]  [5]   [12/1/2012]  2
7     [ghi]  [6]   [15/1/2012]  3

也就是说，在每组重复记录中只保留最新的一条（1 条）作为该组的代表。

++重新编辑++

Aaron Bertrand、shawnt00、e2nburner……以及其他各位，我无法用言语表达我有多感谢你们的回复，尽管我还没有完全理解这么多代码。我现在就去研究这些代码，但在此之前，先谢谢你们！

当我刚开始编程、需要写 SQL 查询时，在用过下面这条语句之后：

Select * From MyTable

...我的第一个 SQL 语句 ...

我说：嘿，我会 SQL 了！！！……而现在……看看你们对它的理解有多深……非常感谢。我相信 StackOverflow 上的这篇帖子对其他初学者也会很有帮助。

score 2 · Accepted Answer

此答案使用公用表表达式（CTE）将 row_number() 和 count() 应用于数据的每个“切片”（即按 col2 + col3 分组）。count() 用于确定每个这样的组有多少行，row_number() 用于按 col4 降序赋予“排名”（1 = 每组最新，2 = 第二新，依此类推）。这里还使用 col1（看起来是唯一列）来决出并列的先后。CTE 后面可以跟一条语句，例如 select、update、delete 等。因此您可以先运行第一个 select 来验证这些是否就是您想要保留的行，以及计数是否正确。如果是，就可以继续执行更新和删除。您会注意到，在所有情况下，row_number() 的输出都用于标识要保留的行或要丢弃的行。

要确定要保留的行：

-- Preview the survivor of every (col2, col3) duplicate group together with the
-- size of its group. rn = 1 marks the row with the highest col4; col1 breaks ties.
;WITH ranked AS
(
    SELECT
        col1,
        col2,
        col3,
        col4,
        COUNT(*) OVER (PARTITION BY col2, col3) AS c,
        ROW_NUMBER() OVER (PARTITION BY col2, col3 ORDER BY col4 DESC, col1 DESC) AS rn
    FROM dbo.table_name
)
SELECT col1, col2, col3, col4, c
FROM ranked
WHERE rn = 1;

一旦你确认这些是你想要保留的行，你可以像这样更新它们：

-- Write the group size into col5 of the surviving row of each (col2, col3) group.
-- The CTE is updatable, so the UPDATE changes dbo.table_name directly.
;WITH ranked AS
(
    SELECT
        col1,
        col2,
        col3,
        col4,
        col5,
        COUNT(*) OVER (PARTITION BY col2, col3) AS c,
        ROW_NUMBER() OVER (PARTITION BY col2, col3 ORDER BY col4 DESC, col1 DESC) AS rn
    FROM dbo.table_name
)
UPDATE ranked
SET col5 = c
WHERE rn = 1;

然后以这种方式删除剩余部分：

-- Delete every row that is not ranked first within its (col2, col3) group.
-- Ranking is by col4 descending, then col1 descending, so rn = 1 is kept.
;WITH ranked AS
(
    SELECT
        col1,
        col2,
        col3,
        col4,
        ROW_NUMBER() OVER (PARTITION BY col2, col3 ORDER BY col4 DESC, col1 DESC) AS rn
    FROM dbo.table_name
)
DELETE FROM ranked
WHERE rn > 1;

或者更简单（假设 col5 在更新之前完全为空）：

-- Shortcut: remove every row whose col5 is still NULL.
-- NOTE: only equivalent to deleting the non-survivors if col5 was NULL on all
-- rows before the survivors were updated.
DELETE FROM dbo.table_name
WHERE col5 IS NULL;

score 1 · Accepted Answer

这是一种比较简单的方法。你可能会发现 merge 更好。这些版本保留 col1 值最高的行，并把它的日期列改为 maxdate。Aaron 的答案则保留具有 maxdate 的那一行。我不认为这是一个重要的区别，但应该指出来。

-- Step 1 of 2 (keep-highest-col1 variant): on the row with the highest col1 in
-- each (col2, col3) group, store the group's latest date in col4 and the group's
-- row count in col5.
--
-- The survivor test compares col1 only, and only inside the same group, so it
-- selects exactly the row that the follow-up delete leaves behind. The previous
-- predicate mixed AND/OR without parentheses; AND binds tighter than OR, so a
-- row from a different group with the same col4 and a higher col1 could stop a
-- survivor from being updated.
update MyTable
set col4 = (
        select max(m2.col4)
        from MyTable as m2
        where m2.col2 = MyTable.col2
          and m2.col3 = MyTable.col3
    ),
    col5 = (
        select count(*)
        from MyTable as m2
        where m2.col2 = MyTable.col2
          and m2.col3 = MyTable.col3
    )
where not exists (
    select *
    from MyTable as m2
    where m2.col2 = MyTable.col2
      and m2.col3 = MyTable.col3
      and m2.col1 > MyTable.col1
);

-- Remove every row for which a row with the same col2 and col3 but a higher
-- col1 exists, leaving only the highest-col1 row of each group.
delete older
from MyTable as older
where exists (
    select 1
    from MyTable as newer
    where newer.col2 = older.col2
      and newer.col3 = older.col3
      and newer.col1 > older.col1
);

编辑 2：这是我的 merge 查询

-- Single-statement version of the keep-highest-col1 approach.
-- The source holds one row per (col2, col3) group: the highest col1 (the row to
-- keep), the group's latest col4 and the group's row count.
--   * matched              -> the survivor: store the latest date and the count
--   * not matched by source -> every other row of the table: delete it
--
-- Matching is on col1 alone. col1 is the table's key, so it identifies the
-- survivor by itself, and comparing col2/col3 with "=" would treat rows whose
-- col2 or col3 is NULL as unmatched and delete them.
-- The count alias is dupcount because ROWCOUNT is a reserved word in T-SQL.
-- MERGE must be terminated with a semicolon.
merge MyTable as target
using (
    select max(col1), col2, col3, max(col4), count(*)
    from MyTable
    group by col2, col3
) as source (id, col2, col3, maxdate, dupcount)
on target.col1 = source.id
when matched then
    update set col4 = source.maxdate, col5 = source.dupcount
when not matched by source then
    delete;

编辑 3：保留原本就具有 maxdate 的那一行，并用 col1 决出并列的先后

-- option #1
-- Step 1 of 2: store the group's row count in col5 of the row that has the
-- latest col4 in its (col2, col3) group; ties on col4 go to the higher col1.
--
-- The parentheses are required: AND binds tighter than OR, so without them the
-- tie-break arm (same col4, higher col1) would also match rows from other
-- groups and wrongly exclude a survivor from the update.
update MyTable
set col5 = (
    select count(*)
    from MyTable as m2
    where m2.col2 = MyTable.col2 and m2.col3 = MyTable.col3
)
where not exists (
    select *
    from MyTable as m2
    where
        m2.col2 = MyTable.col2 and m2.col3 = MyTable.col3
        and (
            m2.col4 > MyTable.col4
            or (m2.col4 = MyTable.col4 and m2.col1 > MyTable.col1)
        )
);

-- Delete every row for which the same (col2, col3) group contains a newer row:
-- one with a later col4, or the same col4 and a higher col1.
--
-- The parentheses are required: AND binds tighter than OR, so without them the
-- "same col4, higher col1" arm is not limited to the same group and rows get
-- deleted because of unrelated rows elsewhere in the table.
delete from MyTable
where exists (
    select *
    from MyTable as m2
    where
        m2.col2 = MyTable.col2 and m2.col3 = MyTable.col3
        and (
            m2.col4 > MyTable.col4
            or (m2.col4 = MyTable.col4 and m2.col1 > MyTable.col1)
        )
);

-- option #2
-- Single-statement version: keep, per (col2, col3) group, the row with the
-- latest col4 (ties go to the higher col1), store the group's row count in its
-- col5, and delete every other row.
--
-- The survivor is picked with row_number() rather than max(col1) + max(col4):
-- the two maxima can come from different rows of a group, in which case no row
-- would match and the whole group would be deleted.
-- Matching is on col1 alone; it is the table's key and stays correct when col2
-- or col3 is NULL.
-- The count alias is dupcount because ROWCOUNT is a reserved word in T-SQL.
-- MERGE must be terminated with a semicolon.
merge MyTable as target
using (
    select ranked.col1, ranked.dupcount
    from (
        select
            col1,
            count(*) over (partition by col2, col3) as dupcount,
            row_number() over (
                partition by col2, col3
                order by col4 desc, col1 desc
            ) as rn
        from MyTable
    ) as ranked
    where ranked.rn = 1
) as source (keepid, dupcount)
on target.col1 = source.keepid
when matched then
    update set col5 = source.dupcount
when not matched by source then
    delete;

score 0 · Accepted Answer

-- Number the rows of each colum2 group by colum3 descending, then delete every
-- row of mytable whose [yourID] belongs to a row numbered other than 1.
-- colum2, colum3 and [yourID] are placeholders for the real column names.
WITH a AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY colum2 ORDER BY colum3 DESC) AS RowNum
    FROM mytable
)
DELETE FROM mytable
WHERE [yourID] IN (
    SELECT a.[yourID]
    FROM a
    WHERE a.RowNum <> 1
)

sql - 删除重复最简单的方法/解释

3 回答 3

Related

Reference