sql - 在一组重叠的、版本编号的区间中，找到每个时间点的最新版本

Question

我正在使用一组日期间隔，其中每个间隔都有一个版本号，新间隔将经常与旧间隔重叠，甚至是它们的子集。根据这些数据，我需要计算一组新的间隔，以显示每个时间点的最新版本号。这个问题有基于集合的解决方案吗？

这是一个插图：

Interval 1: 11111111111111111111111      
Interval 2:     2222222222               
Interval 3:   33333333333333             
Interval 4:                     444444444
Interval 5:                   555555555  
Result    : 11333333333333331155555555544

这是我正在使用的数据示例：

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   1/1/2011    1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  12/31/2012  6
1         10/1/2012  11/1/2012   8

...以及所需的输出：

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   10/1/2010   1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  10/1/2012   6
1         10/1/2012  11/1/2012   8 << note how version 8 supersedes version 6
1         11/1/2012  12/31/2012  6 << version 6 is split into two records

我还没有找到任何其他关于这个问题的例子，我的谷歌搜索只找到了识别间隙和岛屿或覆盖集的查询。

我想我有一个迭代解决方案（SQL Server 2008）。它从结果集中间隔的临时表开始，并通过插入具有特殊版本号的记录来定义我们想要覆盖的范围的起点和终点。然后，它反复识别结果集间隔之间的间隙，并尝试用原始数据集中的最新记录填充它们，直到没有更多的间隙或没有更多的记录要添加：

GO
-- Create data set and results table.
-- #Data holds the raw, possibly overlapping, version-numbered intervals.
CREATE TABLE #Data (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  INT
)

-- Sample rows: group 1 has nested and duplicated ranges, group 2 mirrors the
-- example from the question text.
INSERT INTO #Data (groupId, startDate, endDate, versionId)
VALUES
     (1, '2007-12-22', '2008-12-22', 8)
    ,(1, '2008-12-22', '2009-12-22', 9)
    ,(1, '2009-12-22', '2010-12-22', 10)
    ,(1, '2010-12-22', '2011-12-22', 11)
    ,(1, '2011-01-01', '2011-11-30', 500)
    ,(1, '2011-12-22', '2012-12-22', 12)
    ,(1, '2012-01-22', '2012-12-22', 13)
    ,(1, '2012-01-22', '2012-12-22', 14)
    ,(1, '2012-04-22', '2012-12-22', 17)
    ,(1, '2012-04-22', '2012-12-22', 19)
    ,(2, '2010-01-01', '2011-01-01', 1)
    ,(2, '2010-10-01', '2011-07-05', 2)
    ,(2, '2011-07-05', '2012-08-13', 3)
    ,(2, '2012-08-13', '2012-12-31', 6)
    ,(2, '2012-10-01', '2012-11-01', 8)


-- #Results accumulates the non-overlapping output intervals.
-- groupId uses the same type as #Data.groupId so joins need no implicit
-- conversion and ORDER BY groupId sorts numerically.
-- versionId is BIGINT because the placeholder value below does not fit in INT.
CREATE TABLE #Results (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  BIGINT
)

DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

-- Seed two placeholder rows per group that bracket the range to be covered:
--   * one spanning from the earlier to the later of (@startDate, first startDate)
--   * one spanning from the earlier to the later of (@endDate, last endDate)
-- GROUP BY already yields one row per group, so no DISTINCT is needed.
INSERT INTO #Results (groupId, startDate, endDate, versionId)
SELECT
     groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
UNION ALL
SELECT
     groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #Data
GROUP BY groupId
GO

-- Fill gaps in results table.
-- Each pass finds the uncovered gaps between rows already in #Results and
-- fills every gap with the highest eligible version from #Data, clipped to the
-- gap. Passes repeat until one inserts nothing (capped at 10 passes).
DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

-- These must match the values used when the placeholder rows were seeded,
-- otherwise the placeholder tests inside Gaps can misclassify a row.
SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

DECLARE @counter INT
SET @counter = 0

WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        -- One row per gap: it starts at an uncovered endDate (gs) and runs to
        -- the nearest uncovered startDate at or after it (ge).
        SELECT
             gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            -- Gap starts: endDate of a result row that no row with a different
            -- versionId in the same group extends past. The end placeholder is
            -- excluded because nothing needs filling after it.
            SELECT groupId, endDate as startDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.startDate <= r1.endDate
                        AND r2.endDate > r1.endDate
                )
                AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            -- Gap ends: startDate of a result row that no row with a different
            -- versionId in the same group reaches from before. The start
            -- placeholder is excluded because nothing needs filling before it.
            SELECT groupId, startDate as endDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.endDate >= r1.startDate
                        AND r2.startDate < r1.startDate
                )
                AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
            AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
         groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
         d.groupId
        -- clip the source interval to the gap boundaries
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        -- mg: per gap, the highest source version that overlaps the gap, is
        -- lower than the lowest version among result rows touching the gap,
        -- and is not fully contained in a higher version's interval.
        SELECT 
             d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                    AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
            AND NOT EXISTS (
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                    AND dsup.versionId > d.versionId
                    AND dsup.startDate <= d.startDate
                    AND dsup.endDate >= d.endDate
            )
        GROUP BY
             d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId

    -- A pass that inserts nothing leaves #Results unchanged, so every later
    -- pass would insert nothing as well: stop early.
    IF @@ROWCOUNT = 0 BREAK
END

-- Final output: everything except the placeholder rows.
SELECT
     groupId
    ,startDate
    ,endDate
    ,versionId
FROM #Results
WHERE versionId <> @placeholderId
ORDER BY groupId, startDate

基于集合的解决方案会更有用，但我一直在努力寻找。有任何想法吗？

score 4 · Accepted Answer

-- Calendar table: one row per day, used to expand intervals into single days.
CREATE TABLE dates (thedate DATE PRIMARY KEY CLUSTERED);

-- master..spt_values rows of type 'p' supply consecutive integers.
-- y.number is a year offset from 1900 and d.number a zero-based day of that
-- year, so 100-150 produces every day of 2000 through 2050; adjust as required.
WITH calendar (thedate) AS (
    SELECT DATEADD(yy, y.number, 0) + d.number
    FROM master..spt_values AS y
    INNER JOIN master..spt_values AS d
        ON d.type = 'p'
       -- day-of-year of Dec 31 = number of days in that year
       AND d.number < DATEPART(dy, DATEADD(yy, y.number + 1, 0) - 1)
    WHERE y.type = 'p'
      AND y.number BETWEEN 100 AND 150
)
INSERT INTO dbo.dates
SELECT thedate
FROM calendar;

-- For each covered date, determine the prevailing (highest) version.
-- Both startDate and endDate are treated as inclusive here.
SELECT
    t.groupId,
    d.thedate,
    MAX(t.versionId) AS versionId
INTO #tmp1
FROM dates AS d
INNER JOIN #Data AS t
    ON t.startDate <= d.thedate
   AND d.thedate <= t.endDate
GROUP BY t.groupId, d.thedate;

-- Index to help the run detection below.
CREATE CLUSTERED INDEX cix_tmp1 ON #tmp1 (groupId, thedate, versionId);

-- Collapse consecutive days that share a version into one interval.
-- Within a (groupId, versionId) partition, "thedate minus row number" is
-- constant across a run of consecutive days and changes whenever a day is
-- skipped, so it identifies each run. Taking MIN/MAX per run gives every run,
-- including the last one in a group, a real end date, and a run is never
-- stretched across days that no interval covers.
;WITH runs AS (
    SELECT
        groupId,
        thedate,
        versionId,
        DATEADD(
            DAY,
            -CAST(ROW_NUMBER() OVER (PARTITION BY groupId, versionId ORDER BY thedate) AS INT),
            thedate
        ) AS runAnchor
    FROM #tmp1
)
SELECT
    groupId,
    MIN(thedate) AS startdate,
    MAX(thedate) AS enddate,
    versionId
FROM runs
GROUP BY groupId, versionId, runAnchor
ORDER BY groupId, startdate;

当然，您可以在“一个查询”中完成所有操作，但这样做会带来风险，因为性能会大打折扣。

不要使用 - 仅用于学术兴趣 -

-- Single-statement variant: calendar, per-day prevailing version and run
-- detection all expressed as CTEs.
;WITH dates (thedate) AS (
    -- years.number is a year offset from 1900, days.number a zero-based day of
    -- that year; 100-150 produces every day of 2000 through 2050.
    -- Adjust as required.
    SELECT DATEADD(yy, years.number, 0) + days.number
    FROM master..spt_values AS years
    INNER JOIN master..spt_values AS days
        ON days.type = 'p'
       AND days.number < DATEPART(dy, DATEADD(yy, years.number + 1, 0) - 1)
    WHERE years.type = 'p'
      AND years.number BETWEEN 100 AND 150
), tmp1 AS (
    -- highest version covering each day (startDate and endDate inclusive)
    SELECT
        t.groupId,
        d.thedate,
        MAX(t.versionId) AS versionId
    FROM dates AS d
    INNER JOIN #Data AS t
        ON t.startDate <= d.thedate
       AND d.thedate <= t.endDate
    GROUP BY t.groupId, d.thedate
), runs AS (
    -- "thedate minus row number" is constant across a run of consecutive days
    -- with the same version, so it identifies each run
    SELECT
        groupId,
        thedate,
        versionId,
        DATEADD(
            DAY,
            -CAST(ROW_NUMBER() OVER (PARTITION BY groupId, versionId ORDER BY thedate) AS INT),
            thedate
        ) AS runAnchor
    FROM tmp1
)
-- MIN/MAX per run gives every run, including the last one in a group, a real
-- end date.
SELECT
    groupId,
    MIN(thedate) AS startdate,
    MAX(thedate) AS enddate,
    versionId
FROM runs
GROUP BY groupId, versionId, runAnchor
ORDER BY groupId, startdate;

score 1 · Accepted Answer

Updated due to some feedback from the comments. I'm not going to worry about the end cases that a few people have pointed out since they've been proven trivial to solve in other Answers, but I wanted to go ahead and get a working version out that didn't require DDL... I figure it's just good to have options. :-)

This code should work:

-- Split each group's timeline at every date where the set of covering
-- intervals can change, then report the highest version covering each piece.
;with boundary as (
    -- candidate segment starts: every startDate and every day after an endDate
    select groupId, startDate from #Data
    union
    select groupId, DATEADD(DAY, 1, endDate) from #Data
),
numbered as (
    select
        groupId,
        startDate,
        ROW_NUMBER() over (partition by groupId order by startDate) as rownumber
    from boundary
),
segment as (
    -- a segment ends the day before the next boundary; the last boundary of a
    -- group has no successor and falls back to the hard-coded end date
    select
        cur.groupId,
        cur.startDate,
        coalesce(DATEADD(DAY, -1, nxt.startDate), ('2012-12-31')) as segment_end_date
    from numbered cur
    left outer join numbered nxt
        on nxt.groupId = cur.groupId
       and nxt.rownumber = cur.rownumber + 1
),
nesty as (
    -- drop segments that would end before they start
    select groupId, startDate, segment_end_date
    from segment
    where startDate <= segment_end_date
)
select nesty.groupId, nesty.startDate, nesty.segment_end_date, Max(bob.versionId)
from nesty
left outer join #Data bob
    on bob.groupId = nesty.groupId
   and nesty.segment_end_date between bob.startDate and bob.endDate
group by nesty.groupId, nesty.startDate, nesty.segment_end_date
order by nesty.groupId, nesty.startDate

There are a couple of tiny caveats I had to do to get it into a single SQL statement. First, the max end date is not dynamic; I hard coded '2012-12-31'. You can replace it with a MAX(endDate), but you can't put that in the GROUP BY statement. If you can do this in a procedure, you can do:

-- T-SQL assigns a variable with SELECT @var = expr (there is no SELECT INTO @var)
declare @max_end_date date
select @max_end_date = MAX(endDate) from #Data

and replace '2012-12-31' with @max_end_date.

Second, I do not guarantee that two adjacent segments won't have the same value! This may or may not be important to you... that is, if you had the following:

Interval 1:       111111      
Interval 2:   22222222222222

Your output would be:

Interval 1:   2222
Interval 2:       2222222222

Still, I think it's worth hitting it in a simple and efficient SQL query. It may not be hard to fix those caveats, but it didn't matter to what I was working on, so I haven't bothered yet.

score 0 · Accepted Answer

如果结束日期也很重要，这里有一种方法可以做到。如果您的版本区间使用的是 datetime 而不仅仅是日期，此解决方案也可以调整后适用。

首先是一堆函数

一个在给定日期获取版本

-- Returns the highest VersionID among the intervals of @GroupID that cover
-- @Date, or NULL when no interval covers it.
-- An interval covers @Date when StartDate <= @Date < EndDate + 1 day.
Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
  Declare @Ret int = Null
  Select
    @Ret = Max(VersionID)
  From
    dbo.VersionedIntervals iv
  Where
    iv.GroupID = @GroupID And
    iv.StartDate <= @Date And
    -- Same test as "EndDate + 1 > @Date", with the arithmetic moved off the
    -- column: it stays index-friendly and also works when EndDate is a DATE.
    -- If dates were half open intervals this would just be iv.EndDate > @Date.
    iv.EndDate > DateAdd(Day, -1, @Date)
  Return @Ret
End

接下来获取两个日期时间的中点（分钟分辨率）：

-- Returns the point halfway between @Start and @End at minute resolution.
-- The half-span uses integer division, so an odd minute count is truncated.
Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
  Declare @HalfSpanMinutes int
  Set @HalfSpanMinutes = DateDiff(Minute, @Start, @End) / 2
  Return DateAdd(Minute, @HalfSpanMinutes, @Start)
End

中间版本：

-- Returns the version in effect for @GroupID halfway between @Start and @End.
Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
  Declare @Mid datetime
  Set @Mid = dbo.Midpoint(@Start, @End)
  Return dbo.VersionAtDate(@GroupID, @Mid)
End;

最后是一个表值函数来帮助解决一些点是一个范围的开始和另一个范围的结束这一事实，它有助于从一个输入中获取两行：

-- Advances the run being tracked for a group from the bound @End to the next
-- bound @Next, and returns the resulting row(s).
-- Returns two rows if a point is the end of one interval and the start of
-- another.
--
-- Parameters:
--   @GroupID  group being processed
--   @RN       row number written to the continuing output row
--   @Start    start of the run currently being tracked
--   @End      last bound already processed for that run
--   @Next     next bound to process
--   @Version  version of the run currently being tracked
--
-- A row written with RN = -1 closes a run.
-- NOTE(review): comparisons against a NULL version (no covering interval) are
-- never true, so such cases take the Else branches -- confirm this is the
-- intended handling of uncovered spans.
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
  Declare @NextVersion int, @VersionAtMidpoint int
  Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
  If @NextVersion = @Version
    -- interval carries on: extend the current run to @Next
    Insert Into @EndPoints Select @GroupID, @RN, @Version, @Start, @Next
  Else
  Begin
    -- interval has ended
    -- (function name spelled exactly as declared so it also resolves under a
    -- case-sensitive collation)
    Set @VersionAtMidpoint = dbo.VersionAtMidpoint(@GroupID, @End, @Next)
    If @VersionAtMidpoint <> @Version
        -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
        -- 3333333
        -- 44     
        Insert Into @EndPoints Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next 
    Else
    Begin
        -- We have something like this, end the run of 3s and start the run of fours
        -- 33333
        --   444
        Insert Into @EndPoints Select @GroupID, -1, @Version, @Start, @Next
        Insert Into @EndPoints Select @GroupID, @RN, @NextVersion, @Next, @Next
    End
  End
  Return
End

有了所有这些机制，最后是递归 CTE 加上表变量，您需要适当地设置 maxrecursion：

-- Every distinct interval boundary (start or end) per group, numbered in date
-- order so the recursive walk below can step from one boundary to the next.
Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))

;With AllBounds (GroupID, BoundDate) As (
    Select GroupID, StartDate From dbo.VersionedIntervals
    Union   -- Union (not Union All): a date that is both a start and an end appears once
    Select GroupID, EndDate From dbo.VersionedIntervals
)
Insert Into @Bounds (GroupID, RN, BoundDate)
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From
    AllBounds

-- Walk the boundaries in order. The anchor opens a run at each group's first
-- boundary; each recursive step passes the previous row and the next boundary
-- to dbo.EndPoints, which returns the row(s) to carry forward.
-- Recursion depth follows the number of boundaries per group, so set
-- maxrecursion appropriately for larger data sets.
;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) As (
    Select
        first.GroupID,
        first.RN,
        first.BoundDate,
        first.BoundDate,
        dbo.VersionAtDate(first.GroupID, first.BoundDate)
    From
        @Bounds first
    Where
        first.RN = 1
    Union All
    Select
        ep.GroupID,
        ep.RN,
        ep.StartDate,
        ep.EndDate,
        ep.Version
    From
        @Bounds nxt
            Inner Join
        VersionedBounds prev
            On prev.GroupID = nxt.GroupID And nxt.RN = prev.RN + 1
            Cross Apply
        dbo.EndPoints(prev.GroupID, nxt.RN, prev.StartDate, prev.EndDate, nxt.BoundDate, prev.Version) ep
)
-- A run produces one row per boundary it passes; keep one row per start date.
Select
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate

http://sqlfiddle.com/#!6/b95bd/2

sql - 在一组重叠的、版本编号的区间中，找到每个时间点的最新版本

3 回答 3

Related

Reference