1

我有两个数据表,其中包含如下数据:-

|  id   | name |   dob    |          | name |   dob    |
|-------|------|----------|          |------|----------|  
| 12345 | ABC  | 20010301 |          | ABC  | 20010301 |  - matching record
| 45678 | DEF  | 20010425 |          | XYZ  | 20010301 |  - unmatched record

是否可以编写一个查询来比较这两个表,然后分别创建一个“匹配”表和一个“不匹配”表,同时保持原始表的结构和数据不变?

  Match Table        Unmatched Table
|  id   | rank |     |  id   | rank |   
|-------|------|     |-------|------|
| 12345 |  1   |     | 45678 | NULL |

我正在尝试使用 MERGE,但这样我就必须对其中一个源表执行插入/更新,而且我的 T-SQL 水平也已经到了极限——另外,我要处理的数据集超过 30,000,000 行——有什么建议吗?
我目前的 SQL(字段与上面的示例不一致,但原理相同)如下:

-- Cohort under test: four fixed IDs whose rank has not been assigned yet.
CREATE TABLE #Cohort
(
    [ID]         varchar(4),
    [match rank] int
);
INSERT INTO #Cohort ([ID], [match rank]) VALUES ('aaaa', NULL);
INSERT INTO #Cohort ([ID], [match rank]) VALUES ('bbbb', NULL);
INSERT INTO #Cohort ([ID], [match rank]) VALUES ('cccc', NULL);
INSERT INTO #Cohort ([ID], [match rank]) VALUES ('dddd', NULL);

-- Table to compare against: three IDs cut from fresh GUIDs plus the literal 'aaaa'.
CREATE TABLE #link
(
    [ID]         varchar(4),
    [match rank] int
);
INSERT INTO #link ([ID], [match rank]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [match rank]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [match rank]) VALUES ('aaaa', NULL);
INSERT INTO #link ([ID], [match rank]) VALUES (LEFT(NEWID(), 4), NULL);

-- Empty result tables with the same two-column layout as the inputs.
CREATE TABLE #Matches
(
    [ID]         varchar(4),
    [match rank] int
);
CREATE TABLE #Unmatched
(
    [ID]         varchar(4),
    [match rank] int
);

-- Compare #Cohort with the set of cohort IDs that also occur in #link.
-- Cohort rows without a partner in that set are deleted, and the deleted rows
-- are captured in #Unmatched through the OUTPUT clause.
-- There is no WHEN MATCHED branch, so matching cohort rows are left untouched.
MERGE INTO #Cohort AS cohort_target
USING
(
    -- Distinct cohort IDs that have at least one row in #link, tagged with rank 1.
    SELECT DISTINCT
        c.[ID],
        1 AS [match rank]
    FROM #Cohort AS c
    INNER JOIN #link AS l
        ON c.[ID] = l.[ID]
) AS matched_source
    ON (cohort_target.[ID] = matched_source.[ID])
-- Every source ID is read from #Cohort itself, so this branch finds nothing to insert.
WHEN NOT MATCHED BY TARGET THEN
    INSERT ([ID], [match rank])
    VALUES (matched_source.[ID], matched_source.[match rank])
WHEN NOT MATCHED BY SOURCE THEN
    DELETE
OUTPUT deleted.* INTO #Unmatched;
4

3 回答 3

0

查找匹配/不匹配记录的标准方法是执行左连接并在左连接表中查找 NULL。

-- Matched rows: every Table1 id that has at least one Table2 row with the same name.
-- [rank] is the number of Table2 rows that matched that id.
-- An INNER JOIN is used because LEFT JOIN ... WHERE t2.name IS NOT NULL discards the
-- unmatched rows again and is therefore an inner join in disguise.
-- No ORDER BY: SELECT ... INTO does not guarantee insertion order, so sorting
-- would only add cost on large inputs.
-- NOTE(review): only name is compared; add dob to the ON clause if a match must
-- agree on both columns — confirm against the real schema.
SELECT
    t1.id,
    COUNT(*) AS [rank]
INTO #MatchedTable
FROM Table1 AS t1
INNER JOIN Table2 AS t2
    ON t2.name = t1.name
GROUP BY t1.id;

和:

-- Unmatched rows: every Table1 id that has a row with no same-named row in Table2.
-- The LEFT JOIN keeps all Table1 rows; rows without a partner come back with
-- t2.name NULL, which is what the WHERE clause selects.
-- GROUP BY collapses the result to one row per id.
-- The NULL rank is cast explicitly so the column type of the new table is stated
-- rather than implied.
-- No ORDER BY: SELECT ... INTO does not guarantee insertion order, so sorting
-- would only add cost on large inputs.
SELECT
    t1.id,
    CAST(NULL AS int) AS [rank]
INTO #UnmatchedTable
FROM Table1 AS t1
LEFT JOIN Table2 AS t2
    ON t2.name = t1.name
WHERE t2.name IS NULL
GROUP BY t1.id;

我希望这有帮助。

于 2013-05-07T15:44:40.960 回答
0

使用 CTE 即可:最终 #Matches 中保存匹配的行,#Unmatched 中保存不匹配的行。就目前而言,您的 MERGE 语句会从 #Cohort 表中删除行,只留下值为 aaaa 的那一行。

-- Cohort: four fixed IDs, rank not assigned yet.
CREATE TABLE #Cohort
(
    [ID]         varchar(4),
    [MATCH RANK] int
);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('aaaa', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('bbbb', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('cccc', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('dddd', NULL);

-- Link table: three IDs cut from fresh GUIDs plus the literal 'aaaa'.
CREATE TABLE #link
(
    [ID]         varchar(4),
    [MATCH RANK] int
);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES ('aaaa', NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);

-- Result tables, initially empty, with the same layout as the inputs.
CREATE TABLE #Matches
(
    [ID]         varchar(4),
    [MATCH RANK] int
);
CREATE TABLE #Unmatched
(
    [ID]         varchar(4),
    [MATCH RANK] int
);

-- Matched rows: every distinct #Cohort ID that also occurs in #link.
-- A match is given rank 1, as in the desired output of the question; the cohort's
-- own [MATCH RANK] is NULL for every sample row, so copying it would leave matched
-- and unmatched rows indistinguishable by rank.
-- The INSERT names its target columns so it does not depend on column order.
;WITH MatchedTbl AS
(
    SELECT DISTINCT
        c.[ID],
        1 AS [MATCH RANK]
    FROM #Cohort AS c
    INNER JOIN #link AS h
        ON c.[ID] = h.[ID]
)
INSERT INTO #Matches ([ID], [MATCH RANK])
SELECT
    m.[ID],
    m.[MATCH RANK]
FROM MatchedTbl AS m;

-- Unmatched rows: every distinct #link row whose ID has no partner in #Cohort.
-- NOT EXISTS replaces NOT IN because [ID] is nullable: a single NULL ID in
-- #Cohort would make NOT IN return no rows at all.
-- The temp table is referenced as #Cohort, exactly as it was created, so the
-- statement also resolves under a case-sensitive collation.
-- NOTE(review): this reports #link rows missing from #Cohort; the question's
-- expected output lists cohort IDs without a link — confirm which side is wanted.
;WITH NonMatchedTbl AS
(
    SELECT DISTINCT
        l.[ID],
        l.[MATCH RANK]
    FROM #link AS l
    WHERE NOT EXISTS
    (
        SELECT 1
        FROM #Cohort AS c
        WHERE c.[ID] = l.[ID]
    )
)
INSERT INTO #Unmatched ([ID], [MATCH RANK])
SELECT
    n.[ID],
    n.[MATCH RANK]
FROM NonMatchedTbl AS n;

-- Show the contents of all four temp tables, then drop them.
-- Each table is referenced with the exact casing it was created with (#link,
-- not #Link) so the script also runs under a case-sensitive collation.
SELECT [ID], [MATCH RANK] FROM #Cohort;
SELECT [ID], [MATCH RANK] FROM #link;
SELECT [ID], [MATCH RANK] FROM #Matches;
SELECT [ID], [MATCH RANK] FROM #Unmatched;

DROP TABLE #Cohort;
DROP TABLE #link;
DROP TABLE #Matches;
DROP TABLE #Unmatched;
于 2013-05-07T15:45:46.793 回答
0

如果您要处理海量数据,可以尝试两种做法:如果仍想使用 MERGE 语句,可以分批(batch)执行,而不是一次性全部执行;或者,您可以先划分批次,再直接执行插入。无论采用哪种方式,我都建议使用一个暂存区(staging area),在其上创建索引,然后再进行插入。批次可以用 NTILE 函数来分配。下面这个自包含的示例可以在 SQL Server 2008 或更高版本中运行:

-- Self-contained batching demo for SQL Server 2008 or later.
-- People are left-joined to their orders, every joined row is tagged with an
-- NTILE batch number in a staging table, and one batch is then split into rows
-- that have an order (@Matched) and rows that have none (@UnMatched).

DECLARE @Person TABLE (personID int IDENTITY, person varchar(8));

INSERT INTO @Person (person)
VALUES ('Brett'), ('Sean'), ('Chad'), ('Michael'), ('Ray'), ('Erik'), ('Quyen'), ('John'), ('Tim');

DECLARE @Orders TABLE (OrderID int IDENTITY, PersonID int, Description varchar(32), Amount int);

INSERT INTO @Orders (PersonID, Description, Amount)
VALUES
    (1, 'Shirt', 20), (1, 'Shoes', 50),
    (2, 'Shirt', 22), (2, 'Shoes', 52),
    (3, 'Shirt', 20), (3, 'Shoes', 50), (3, 'Hat', 20),
    (4, 'Shirt', 20),
    (5, 'Shirt', 20), (5, 'Pants', 30),
    (6, 'Shirt', 20), (6, 'RunningShoes', 70),
    (7, 'Shirt', 22), (7, 'Shoes', 40), (7, 'Coat', 80);

-- Staging table: one row per person/order pair plus the batch it belongs to.
DECLARE @Storage TABLE (batch int, personid int, person varchar(8), orderid int, Description varchar(32), amount int);

INSERT INTO @Storage (batch, personid, person, orderid, Description, amount)
SELECT
    -- NTILE(5) spreads the ordered rows over 5 batches of near-equal size
    -- (500 rows would give 100 rows per batch). OrderID is a tie-breaker so the
    -- batch assignment is the same on every run.
    NTILE(5) OVER (ORDER BY p.personID, o.OrderID),
    p.personID,
    p.person,
    o.OrderID,
    o.Description,
    o.Amount
FROM @Person AS p
-- The left join keeps people who have no orders; their order columns are NULL.
LEFT JOIN @Orders AS o
    ON p.personID = o.PersonID;

-- Batch number to process; change it to load a different batch.
DECLARE @Cursor int = 5;

-- Stand-in target tables for the matched / unmatched results.
DECLARE @Matched TABLE (personid int, person varchar(8), orderid int, Description varchar(32), amount int);
DECLARE @UnMatched TABLE (personid int, person varchar(8), orderid int, Description varchar(32), amount int);

-- Rows of the selected batch that have an order.
INSERT INTO @Matched (personid, person, orderid, Description, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Description,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NOT NULL;

-- Rows of the selected batch that have no order.
INSERT INTO @UnMatched (personid, person, orderid, Description, amount)
SELECT
    s.personid,
    s.person,
    s.orderid,
    s.Description,
    s.amount
FROM @Storage AS s
WHERE s.batch = @Cursor
  AND s.orderid IS NULL;

SELECT personid, person, orderid, Description, amount FROM @Matched;
SELECT personid, person, orderid, Description, amount FROM @UnMatched;

我的示例非常简单,但您可以更改 @Cursor 变量的值,查看暂存之后各个批次得到的不同结果。借助分批处理,我不必一次处理整个数据集:可以先把数据放入暂存表,然后编写一个存储过程,根据不断变化的游标值(一个整数)逐批执行插入。您甚至可以添加一列,用来标记数据是否已经处理过。

于 2013-05-07T15:55:22.393 回答