我在这个问题上花费的时间比我愿意承认的要多。我有一个使用游标实现的解决方案(见下文),但想知道是否还有其他可行的方法?这个问题对我来说尤其困难,因为 SQL 中没有数组之类的常见数据结构。
这看起来也很适合用递归来解决,但我没能想出具体做法。平台是 MSSQL 2008,使用 T-SQL。
考虑一张表,它有两列非唯一的标识符和一个日期列。对于每个日期,我想把第一列 (X) 的标识符合并(分组)成若干集合,使得每个集合内第二列 (Y) 的标识符互不重复。
- X 标识符是非唯一的
- Y 标识符在每组 X 标识符中是唯一的,但总体上不唯一
- 组合时,使用最小可能的 X 标识符
- X 标识符不会跨越多个日期
也许最好从一些样本数据入手。解决方案中还包含一份更完整的示例数据。在我的实际应用中,数据一般少于 200 行,多数情况下少于 100 行。
Dt X Y newX
6/1/2012 1 1 1
6/1/2012 1 2 1
6/1/2012 2 3 1
6/1/2012 3 1 3 <-- because Y=1 is already in X=1
6/1/2012 3 4 3
6/1/2012 4 5 1
6/1/2012 5 4 1 <-- Y=4 is in X=3 but not X=1
6/1/2012 5 6 1
6/1/2012 6 4 6 <-- Y=4 is in X=1 and X=3
6/1/2012 6 7 6
解决方案...
-- task: combine/condense/reassign/coalesce/collapse/consolidate sets of X identifiers into groups with unique Y identifiers, and by date
-- - X identifiers are non-unique
-- - Y identifiers are unique within each set of X identifiers, but non-unique overall
-- - When combining, the minimum possible X identifier is used
-- - An X identifier will not span more than a single date
-- Scratch table processed by the cursor pass further down in this script.
--   Dt    : date the row belongs to
--   X     : group identifier; the cursor pass overwrites it with newX
--   Y     : identifier that must not repeat inside a consolidated X group
--   newX  : consolidated X chosen by the cursor pass; 0 means "not assigned yet"
--   origX : copy of X as inserted; this script never updates it
-- Drop any leftover copy first so the script can be re-run in the same session
-- (DROP TABLE IF EXISTS is not available on SQL Server 2008).
IF OBJECT_ID('tempdb..#tmpA') IS NOT NULL
    DROP TABLE #tmpA;

CREATE TABLE #tmpA (Dt DATETIME, X INT, Y INT, newX INT, origX INT);

-- sample data
-- Date literals use the 'YYYYMMDD' form: for DATETIME it is parsed the same way under
-- every SET LANGUAGE / SET DATEFORMAT setting, which 'YYYY-MM-DD' is not.
-- One INSERT per row keeps each row individually toggleable (see the last rows).
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 1, 2, 0, 1);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 1, 1, 0, 1);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 2, 5, 0, 2);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 3, 2, 0, 3);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 3, 3, 0, 3);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 4, 3, 0, 4);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 5, 5, 0, 5);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 6, 5, 0, 6);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 7, 2, 0, 7);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120601' AS DATETIME), 7, 1, 0, 7); -- causes a debug 4
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 1, 2, 0, 1);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 1, 1, 0, 1);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 2, 5, 0, 2);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 3, 2, 0, 3);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 3, 3, 0, 3);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 4, 3, 0, 4);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 5, 5, 0, 5);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 6, 5, 0, 6);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 7, 7, 0, 7);
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 7, 1, 0, 7); -- causes a debug 3 if below not used
--INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 7, 6, 0, 7); -- causes a debug 8 if above not used
INSERT INTO #tmpA (Dt, X, Y, newX, origX) VALUES (CAST('20120602' AS DATETIME), 7, 5, 0, 7);
-- Working variables for the cursor pass over #tmpA.
DECLARE
    @Dt       DATETIME,  -- Dt of the row most recently fetched
    @X        INT,       -- X of the row most recently fetched
    @Y        INT,       -- Y of the row most recently fetched
    @newX     INT,       -- newX of the row most recently fetched
    @tX       INT,       -- temporary X, compared with @X to detect a change in X
    @tDt      DATETIME = CAST('1900-01-01' AS DATETIME), -- temporary date, compared with @Dt to detect a date change
    @min_X    INT,       -- minimum X without Y duplicate
    @min_newX INT;

-- Rows are visited by date, then by X.
DECLARE CursorA CURSOR FOR
    SELECT Dt, X, Y, newX
    FROM #tmpA
    ORDER BY Dt, X;

OPEN CursorA;

-- Fetch the first row before entering the loop.
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX;

-- initialize for change in X detection
SET @tX = @X;
-- Main pass: one fetched row is handled per iteration, until a FETCH fails.
-- Each assigning branch writes newX for the current row (or for its whole X set),
-- emits a result set whose "debug" column numbers the branch that ran, fetches the
-- next row and restarts the loop.
-- @min_X and @min_newX are scratch values: each is (re)assigned by a MIN() query
-- before it is tested, and MIN() over no rows assigns NULL.
WHILE (@@FETCH_STATUS = 0)
BEGIN
-- a change in X?
IF (@tX != @X)
BEGIN
-- change in X, update all prior X to their newX (which should all be the same)
-- The filter uses @tDt (the date being processed so far), not @Dt of the new row.
UPDATE #tmpA SET X = newX WHERE Dt = @tDt AND X = @tX
select 1 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
SET @tX = @X
END
IF (@newX != 0)
BEGIN
-- newX for this X and Y already assigned, move on
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
IF (@Dt != @tDt)
BEGIN
-- date change
SET @tDt = @Dt
-- all for this first X are simply the same identifier
-- (every row of this X on the new date gets newX = X)
UPDATE #tmpA SET newX = @X WHERE Dt = @Dt AND X = @X
select 2 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- still on same date
-- is there any duplicate Y already assigned a newX?
-- (@min_X = smallest other X on this date holding the same Y with newX set; NULL if none)
SELECT @min_X = MIN(X) FROM #tmpA
WHERE Dt = @Dt AND X != @X AND Y = @Y AND newX != 0
IF @min_X IS NOT NULL
BEGIN
-- there is a Y duplicate within this date
-- find the earliest X which does not have a duplicate Y
-- (candidates are other X values with assigned rows; any X that holds this Y is excluded)
SELECT @min_newX = MIN(X) FROM #tmpA
WHERE Dt = @Dt AND X != @X AND Y != @Y AND newX != 0
AND X NOT IN (SELECT X FROM #tmpA
WHERE Dt = @Dt AND X != @X AND Y = @Y AND newX != 0)
IF @min_newX IS NOT NULL
BEGIN
-- has any row of this same X already been assigned a newX?
-- @min_X is reused here: it now holds the smallest newX already set on this X's rows.
SELECT @min_X = MIN(newX) FROM #tmpA
WHERE Dt = @Dt AND X = @X AND newX !=0
IF @min_X IS NOT NULL
BEGIN
-- at least one row of this X already carries a newX
IF @min_newX >= @min_X
BEGIN
-- move the rows of this X that carry newX = @min_X over to @min_newX,
-- then give the current row the same @min_newX
UPDATE #tmpA SET newX = @min_newX
WHERE Dt = @Dt AND X = @X AND newX = @min_X
UPDATE #tmpA SET newX = @min_newX
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 3 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
END
ELSE
BEGIN
-- the newX already used by this X is larger than the candidate: the current row takes that existing newX
UPDATE #tmpA SET newX = @min_X
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 4 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
END
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- no row of this X has a newX yet: the current row takes the candidate X found above
UPDATE #tmpA SET newX = @min_newX
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 5 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- no other X without a duplicate Y already assigned a newX so assign this entire X set to itself
UPDATE #tmpA SET newX = @X WHERE Dt = @Dt AND X = @X
select 6 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- no other Y but it's possible that another newX for this X is set to something different
-- (@min_newX = smallest newX already set on other rows of this same X; NULL if none)
SELECT @min_newX = MIN(newX) FROM #tmpA
WHERE Dt = @Dt AND X = @X AND Y != @Y AND newX != 0
-- also find the smallest other X on this date among assigned rows carrying a different Y
SELECT @min_X = MIN(X) FROM #tmpA
WHERE Dt = @Dt AND X != @X AND Y != @Y AND newX != 0
IF @min_newX IS NULL
BEGIN
-- no other Y for this X is assigned so set it to the minimum X already found
UPDATE #tmpA SET newX = @min_X
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 7 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- there is another of the same X with a newX
-- NOTE(review): "@min_newX IS NULL" cannot be true here, because the NULL case already
-- left the iteration in the branch above; the test is effectively @min_X >= @min_newX.
IF (@min_X = @min_newX OR @min_X > @min_newX OR @min_newX IS NULL)
BEGIN
-- there is a different Y for this X which has already been assigned the same newX as this one should be
-- or a later one was found
UPDATE #tmpA SET newX = @min_X
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 8 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
CONTINUE
END
-- otherwise the current row takes the newX already used by the other rows of this X
UPDATE #tmpA SET newX = @min_newX
WHERE Dt = @Dt AND X = @X AND Y = @Y
select 9 as debug, @tX as tX, @min_X as minX, @min_newX as minR, @X as X, @Y as Y, @newX as newX
FETCH NEXT FROM CursorA INTO @Dt, @X, @Y, @newX
END
-- The loop has ended, so the cursor is no longer needed; release it before the final work.
CLOSE CursorA;
DEALLOCATE CursorA;

-- gotta catch the last set
-- Inside the loop an X set is rewritten to its newX only when the next X shows up,
-- so the set that was being processed when the loop ended still has to be rewritten.
-- @Dt and @tX are used with the values the loop left in them.
UPDATE #tmpA
SET X = newX
WHERE Dt = @Dt
  AND X = @tX;

-- Final result, with the columns listed explicitly in table-definition order.
-- No ORDER BY is applied; append "ORDER BY Dt, X, Y" for a deterministic listing.
SELECT Dt, X, Y, newX, origX
FROM #tmpA;

RETURN;
这是输出的样子......
Dt X Y newX origX
2012-06-01 00:00:00.000 1 2 1 1
2012-06-01 00:00:00.000 1 1 1 1
2012-06-01 00:00:00.000 1 5 1 2
2012-06-01 00:00:00.000 3 2 3 3
2012-06-01 00:00:00.000 3 3 3 3
2012-06-01 00:00:00.000 1 3 1 4
2012-06-01 00:00:00.000 3 5 3 5
2012-06-01 00:00:00.000 6 5 6 6
2012-06-01 00:00:00.000 6 2 6 7
2012-06-01 00:00:00.000 6 1 6 7
2012-06-02 00:00:00.000 1 2 1 1
2012-06-02 00:00:00.000 1 1 1 1
2012-06-02 00:00:00.000 1 5 1 2
2012-06-02 00:00:00.000 3 2 3 3
2012-06-02 00:00:00.000 3 3 3 3
2012-06-02 00:00:00.000 1 3 1 4
2012-06-02 00:00:00.000 3 5 3 5
2012-06-02 00:00:00.000 6 5 6 6
2012-06-02 00:00:00.000 7 7 7 7
2012-06-02 00:00:00.000 7 1 7 7
2012-06-02 00:00:00.000 7 5 7 7