考虑一个名为 Employee 的表,其中有一列 EmployeeName。目标是根据 EmployeeName 字段删除重复记录。
EmployeeName
------------
Anand
Anand
Anil
Dipak
Anil
Dipak
Dipak
Anil
使用一个查询,我想删除重复的记录。
如何在 SQL Server 中使用 TSQL 完成此操作?
考虑一个名为 Employee 的表,其中有一列 EmployeeName。目标是根据 EmployeeName 字段删除重复记录。
EmployeeName
------------
Anand
Anand
Anil
Dipak
Anil
Dipak
Dipak
Anil
使用一个查询,我想删除重复的记录。
如何在 SQL Server 中使用 TSQL 完成此操作?
您可以使用窗口函数来做到这一点。它会在每组重复项内按 empId 排序,并删除除第一条之外的所有记录。
-- Delete duplicate EmployeeName rows from Employee.
-- Rows are numbered within each EmployeeName group in empId order;
-- the row numbered 1 is kept and every higher-numbered row is deleted.
WITH numbered AS (
    SELECT
        ROW_NUMBER() OVER (PARTITION BY EmployeeName ORDER BY empId) AS rn
    FROM Employee
)
DELETE FROM numbered
WHERE rn > 1;
先将其作为 SELECT 运行,以查看哪些行将被删除:
-- Preview only: returns every Employee row whose position within its
-- EmployeeName group (ordered by empId) is greater than 1, together with
-- that position in the rn column. Nothing is modified.
WITH numbered AS (
    SELECT
        e.*,
        ROW_NUMBER() OVER (PARTITION BY e.EmployeeName ORDER BY e.empId) AS rn
    FROM Employee AS e
)
SELECT *
FROM numbered
WHERE rn > 1;
假设您的 Employee 表还有一个唯一列(下面示例中的 ID),则以下语句可以实现:
-- Keep one row per EmployeeName (the one with the smallest ID) and delete
-- every Employee row whose ID is not in that keeper set.
WITH keepers AS (
    SELECT MIN(ID) AS keep_id
    FROM Employee
    GROUP BY EmployeeName
)
DELETE FROM Employee
WHERE ID NOT IN (SELECT keep_id FROM keepers);
这将保留表中 ID 最低的版本。
编辑
关于 McGyver 的评论:自 SQL Server 2012 起,MIN 可以用于 numeric、char、varchar、uniqueidentifier 或 datetime 列,但不能用于 bit 列。
对于 2008 R2 及更早版本,MIN 可以用于 numeric、char、varchar 或 datetime 列,但不能用于 bit 列(也不适用于 GUID)。
对于 2008 R2,您需要将 GUID 转换为 MIN 支持的类型,例如:
-- Same keep-the-minimum approach for a table whose ID is cast to binary(16)
-- on both sides of the comparison, so MIN is applied to the binary value.
-- One keeper per EmployeeName; all other GuidEmployees rows are deleted.
WITH keepers AS (
    SELECT MIN(CAST(ID AS binary(16))) AS keep_id
    FROM GuidEmployees
    GROUP BY EmployeeName
)
DELETE FROM GuidEmployees
WHERE CAST(ID AS binary(16)) NOT IN (SELECT keep_id FROM keepers);
您可以尝试以下方法:
-- Delete every MyTable row that has a sibling with the same dupField and a
-- smaller uniqueField, so only the row with the lowest uniqueField remains
-- for each dupField value.
-- Written with an explicit INNER JOIN instead of the implicit comma join so
-- the pairing condition cannot be dropped by accident.
DELETE dup
FROM MyTable AS dup
INNER JOIN MyTable AS keeper
    ON keeper.dupField = dup.dupField
WHERE dup.uniqueField > keeper.uniqueField;
(这假设您有一个基于整数的唯一字段)
不过就个人而言,我认为您最好从源头入手,在重复条目被写入数据库之前就加以阻止,而不是事后再做修复。
-- Number the rows of employee_table inside each EmployeeName group and
-- delete everything after the first row of each group.
-- The window orders by the same column it partitions on, so all rows in a
-- group tie and which one is numbered 1 is left to the engine.
WITH ranked AS (
    SELECT
        EmployeeName,
        ROW_NUMBER() OVER (PARTITION BY EmployeeName ORDER BY EmployeeName) AS row_in_group
    FROM employee_table
)
DELETE FROM ranked
WHERE row_in_group > 1;
公用表表达式的魔力。
-- Treat (DuplicateColumn1, DuplicateColumn2, DuplicateColumn3) as the
-- duplicate key: keep the row with the largest ID for each combination and
-- delete every other MyTable row.
WITH keepers AS (
    SELECT MAX(ID) AS keep_id
    FROM MyTable
    GROUP BY
        DuplicateColumn1,
        DuplicateColumn2,
        DuplicateColumn3
)
DELETE FROM MyTable
WHERE ID NOT IN (SELECT keep_id FROM keepers);
-- Delete duplicate (FirstName, LastName) rows from dbo.Users through a CTE.
-- duplicateRecordCount is the row's position inside its (FirstName, LastName)
-- group; every row after the first in a group is deleted.
-- Fix: PARTITION BY and ORDER BY are two-word keywords; the fused spellings
-- PARTITIONBY / ORDERBY do not parse.
WITH TempUsers (FirstName, LastName, duplicateRecordCount)
AS
(
    SELECT
        FirstName,
        LastName,
        ROW_NUMBER() OVER (PARTITION BY FirstName, LastName ORDER BY FirstName) AS duplicateRecordCount
    FROM dbo.Users
)
DELETE
FROM TempUsers
WHERE duplicateRecordCount > 1;
尝试
-- Keeps, for each EmployeeName, the row with the highest rowid and deletes
-- every other row of employee.
-- NOTE(review): rowid is not a built-in column in SQL Server; this presumably
-- targets an engine with a rowid pseudo-column, or a table that defines its
-- own column named rowid — confirm before running.
DELETE
FROM employee
WHERE rowid NOT IN (SELECT MAX(rowid) FROM employee
GROUP BY EmployeeName);
如果您想删除重复项,但另有一张表通过外键引用了这张含有重复项的表,则可以采用下面这种基于游标的方法,虽然慢,但有效。
它会把外键表中指向重复记录的键改为指向被保留的那条记录。
-- De-duplicates Sales_OrderLineVersionChangeReasonCode by name while keeping
-- Sales_OrderLineVersion.ChangeReasonCodeId pointing at a surviving row.
-- The first id fetched for each name is kept; every later row with the same
-- name has its references re-pointed to the kept id and is then deleted.

-- Scratch table: one (id, name) row per name that is being kept.
create table #properOlvChangeCodes(
    id int not null,
    name nvarchar(max) not null
)

-- NVARCHAR to match #properOlvChangeCodes.name, so names are not narrowed to
-- a non-Unicode type on their way through the variable.
DECLARE @name NVARCHAR(MAX);
DECLARE @id INT;
DECLARE @newid INT;

DECLARE OLVTRCCursor CURSOR FOR SELECT id, name FROM Sales_OrderLineVersionChangeReasonCode;
OPEN OLVTRCCursor;
FETCH NEXT FROM OLVTRCCursor INTO @id, @name;
WHILE @@FETCH_STATUS = 0
BEGIN
    -- a row in the scratch table means this name was seen before, so the
    -- current row is a duplicate
    if exists (select * from #properOlvChangeCodes where name = @name)
    begin
        -- Look up the kept id in the scratch table. An unordered TOP 1 against
        -- the source table could return the current row itself, or another
        -- duplicate that is deleted later, leaving references dangling.
        select top 1 @newid = id
        from #properOlvChangeCodes
        where name = @name

        -- re-point order line versions from the duplicate to the kept row
        update Sales_OrderLineVersion
        set ChangeReasonCodeId = @newid
        where ChangeReasonCodeId = @id

        -- the duplicate is no longer referenced and can be removed
        delete from Sales_OrderLineVersionChangeReasonCode
        where Id = @id
    end
    else
    begin
        -- first occurrence of this name: remember it as the row to keep
        insert into #properOlvChangeCodes (id, name)
        values (@id, @name)
    end

    FETCH NEXT FROM OLVTRCCursor INTO @id, @name;
END;
CLOSE OLVTRCCursor;
DEALLOCATE OLVTRCCursor;

drop table #properOlvChangeCodes
-- Keep the lowest-ID person row for each email and delete the rest.
-- The MIN(ID) query is wrapped in a derived table instead of being used
-- directly in the NOT IN list.
-- NOTE(review): the wrapper looks like the usual workaround for engines that
-- reject a subquery reading the DELETE target directly — confirm.
DELETE FROM person
WHERE ID NOT IN (
    SELECT keepers.keep_id
    FROM (
        SELECT MIN(ID) AS keep_id
        FROM person
        GROUP BY email
    ) AS keepers
);
请参阅下面的删除方式。
-- Build a table variable holding the eight sample names from the question.
DECLARE @Employee TABLE (EmployeeName varchar(10));

INSERT INTO @Employee (EmployeeName)
VALUES
    ('Anand'), ('Anand'), ('Anil'), ('Dipak'),
    ('Anil'), ('Dipak'), ('Dipak'), ('Anil');

-- Show the data before de-duplication.
SELECT * FROM @Employee;
上面创建了一个名为 @Employee 的示例表,并加载了给定的数据。
-- Number the rows inside each EmployeeName group and delete all but row 1.
-- The window orders by the partition column itself, so which row of a group
-- is numbered 1 is left to the engine.
DELETE numbered
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY EmployeeName ORDER BY EmployeeName) AS row_in_group
    FROM @Employee
) AS numbered
WHERE numbered.row_in_group > 1;

-- Show what is left after the delete.
SELECT * FROM @Employee;
结果:
我知道这是六年前提出的问题,但还是贴出来,以防对其他人有帮助。
这是一种很好的方法:对于带有标识列的表,可以根据您在运行时指定的目标主键对记录去重。在开始之前,我先用以下代码生成一个示例数据集:
-- Builds the sample table _original: 50,000 rows of
-- (rowid identity, monthyear, salesrepid, sale) filled with random values.

-- Start clean: drop the permanent table left behind by a previous run.
if exists (select 1 from sys.all_objects where type='u' and name='_original')
    drop table _original

-- Fix: also drop a leftover #original from an earlier run in the same
-- session; without this the create table below fails on the second run.
if object_id('tempdb..#original') is not null
    drop table #original

-- Each random value below is floor(rand() * (high - low) + low).
-- NOTE(review): rand() stays below 1, so the high bound is never produced;
-- with these values every row gets year 2017 — confirm that is intended.
declare @startyear int = 2017
declare @endyear int = 2018
declare @iterator int = 1
-- sale amount between 4990 and 5000, rounded to 2 decimals
declare @income money = cast((SELECT round(RAND()*(5000-4990)+4990 , 2)) as money)
-- sales rep id from 9000 upward (below 9100)
declare @salesrepid int = cast(floor(rand()*(9100-9000)+9000) as varchar(4))

create table #original (rowid int identity, monthyear varchar(max), salesrepid int, sale money)

while @iterator<=50000 begin
    -- monthyear is built as '<year>-<month>' with an unpadded month (1..12)
    insert #original (monthyear, salesrepid, sale)
    select (Select cast(floor(rand()*(@endyear-@startyear)+@startyear) as varchar(4))+'-'+ cast(floor(rand()*(13-1)+1) as varchar(2)) ), @salesrepid , @income

    -- draw fresh random values for the next row
    set @salesrepid = cast(floor(rand()*(9100-9000)+9000) as varchar(4))
    set @income = cast((SELECT round(RAND()*(5000-4990)+4990 , 2)) as money)
    set @iterator=@iterator+1
end

-- zero-pad single-digit months: '2017-1' (length 6) becomes '2017-01'
update #original
set monthyear=replace(monthyear, '-', '-0') where len(monthyear)=6

-- materialize the sample data as the permanent table _original
select * into _original from #original
接下来我将创建一个名为 ColumnNames 的类型:
-- Table type with a single varchar(max) column, Columnnames; each row holds
-- one column name.
CREATE TYPE ColumnNames AS TABLE
(
    Columnnames varchar(max)
);
最后,我将创建一个存储过程,有以下 3 点需要注意:1. 该过程有一个必需参数 @tablename,用于指定要从中删除重复项的表的名称。2. 该过程有一个可选参数 @columns,用于指定构成去重所依据的目标主键的字段。如果此参数留空,则假定除标识列之外的所有字段共同构成目标主键。3. 删除重复记录时,将保留标识列值最小的那条记录。
这是我的 delete_dupes 存储过程:
-- delete_dupes: removes duplicate rows from the table named by @tablename.
--
-- Parameters:
--   @tablename  name of the table to de-duplicate; it must have an identity column.
--   @columns    optional list of column names forming the duplicate key. When it
--               is empty, all non-identity columns of the table form the key.
--
-- Within each group of rows sharing the key, the row with the lowest identity
-- value is kept and the others are deleted. Work is staged in the global temp
-- table ##temp, which is dropped again at the end.
--
-- Changes from the earlier version:
--   * the final DELETE targets @tablename instead of the hard-coded _original;
--   * a stale ##temp is dropped first, so SELECT ... INTO cannot fail on it;
--   * a missing table or identity column raises a clear error instead of
--     building NULL dynamic SQL.
--
-- NOTE(security): @tablename and the column names are concatenated into
-- dynamic SQL unescaped. Only call this with trusted identifiers.
create proc delete_dupes (@tablename varchar(max), @columns columnnames readonly)
as
begin
    -- columns of the target table; iterator is always 1 and only serves as the
    -- correlation key for the FOR XML concatenations below
    declare @table table (iterator int, name varchar(max), is_identity int)
    -- idx 1 = 'order by' + identity column, idx 2 = 'over (partition by' + other columns
    declare @tablepartition table (idx int identity, type varchar(max), value varchar(max))
    declare @partitionby varchar(max)
    declare @iterator int= 1

    -- caller supplied an explicit key: join the names into ' col1, col2, ...'
    if exists (select 1 from @columns) begin
        declare @columns1 table (iterator int, columnnames varchar(max))
        insert @columns1
        select 1, columnnames from @columns
        set @partitionby = (select distinct
            substring((Select ', '+t1.columnnames
                From @columns1 t1
                Where t1.iterator = t2.iterator
                ORDER BY t1.iterator
                For XML PATH ('')),2, 1000) partition
            From @columns1 t2 )
    end

    insert @table
    select 1, a.name, is_identity from sys.all_columns a join sys.all_objects b on a.object_id=b.object_id
    where b.name = @tablename

    declare @identity varchar(max)= (select name from @table where is_identity=1)

    -- without an identity column there is nothing to order or join on
    if @identity is null begin
        raiserror('delete_dupes: target table was not found or has no identity column.', 16, 1)
        return
    end

    -- pass 1 (@iterator = 1) collects the identity column,
    -- pass 2 (@iterator = 0) collects every non-identity column
    while @iterator>=0 begin
        insert @tablepartition
        Select distinct case when @iterator=1 then 'order by' else 'over (partition by' end ,
            substring((Select ', '+t1.name
                From @table t1
                Where t1.iterator = t2.iterator and is_identity=@iterator
                ORDER BY t1.iterator
                For XML PATH ('')),2, 5000) partition
        From @table t2
        set @iterator=@iterator-1
    end

    -- @originalpartition: select list copied into ##temp (identity column + key columns)
    -- @partitionby:       the complete 'over (partition by ... order by ..., <identity>) rownum' clause
    declare @originalpartition varchar(max)
    if @partitionby is null begin
        select @originalpartition = replace(b.value+','+a.type+a.value ,'over (partition by','') from @tablepartition a cross join @tablepartition b where a.idx=2 and b.idx=1
        select @partitionby = a.type+a.value+' '+b.type+a.value+','+b.value+') rownum' from @tablepartition a cross join @tablepartition b where a.idx=2 and b.idx=1
    end
    else
    begin
        select @originalpartition=b.value +','+ @partitionby from @tablepartition a cross join @tablepartition b where a.idx=2 and b.idx=1
        set @partitionby = (select 'OVER (partition by'+ @partitionby + ' ORDER BY'+ @partitionby + ','+b.value +') rownum'
            from @tablepartition a cross join @tablepartition b where a.idx=2 and b.idx=1)
    end

    -- a ##temp left over from a failed earlier run would make SELECT ... INTO fail
    if object_id('tempdb..##temp') is not null
        drop table ##temp

    -- number the rows of each key group, lowest identity value first
    exec('select row_number() ' + @partitionby +', '+@originalpartition+' into ##temp from '+ @tablename+'')

    -- delete every row of the target table that is not row 1 of its group
    exec(
'delete a from '+@tablename+' a
left join ##temp b on a.'+@identity+'=b.'+@identity+' and rownum=1
where b.rownum is null')

    drop table ##temp
end
完成此操作后,您可以通过运行 proc 删除所有重复记录。要在不定义所需主键的情况下删除重复项,请使用以下调用:
-- De-duplicate _original without passing a column list.
EXECUTE delete_dupes @tablename = '_original';
要根据定义的所需主键删除重复项,请使用以下调用:
-- De-duplicate _original, passing salesrepid and sale as the column list.
DECLARE @table1 AS columnnames;

INSERT INTO @table1 (Columnnames)
VALUES ('salesrepid'), ('sale');

EXECUTE delete_dupes @tablename = '_original', @columns = @table1;