1

我有一个数据表如下

#data
---------------
Account AccountType
---------------
1       2
2       0
3       5
4       2
5       1
6       5

AccountType 2 表示标题,5 表示总计。类型 2 的帐户必须查找下一个类型为 1 或 0 的帐户,以确定其 Dim 值是 1 还是 0。类型 5(总计)的帐户必须查找最接近的类型为 1 或 0 的帐户来确定其 Dim 值。类型为 1 或 0 的帐户,其 Dim 值就是它自身的类型。

类型 2 的帐户以孤岛(连续的多行)形式出现,因此仅检查 RowNumber + 1 是不够的;类型 5 的帐户也是如此。

我已经使用 CTE 得出了下表,但是找不到一种快速的方法,从这张表得到所有帐户的 Account、AccountType、Dim 最终结果。

T3
-------------------
StartRow  EndRow AccountType  Dim
-------------------
1           1         2        0
2           2         0        0
3           3         5        0
4           4         2        1
5           5         1        1
6           6         5        1

下面的代码是 MS T-SQL,可以直接复制粘贴运行。CTE 查询中最后的连接非常慢,500 行就需要 30 秒,而我有 100,000 行需要处理。我做了一个基于游标的解决方案,可以在 10-20 秒内完成,这是可行的;还有一个快速的递归 CTE 解决方案,可以在 5 秒内处理 100,000 行,但它的性能取决于 #data 表的碎片程度。我应该补充一点:这是简化后的例子,真正的问题需要考虑更多的维度,但其原理与这个简单问题相同。

那么,有没有一种使用连接或其他基于集合的解决方案的快速方法来做到这一点?

-- Rebuilds the #data temp table and fills it with a repeating pattern of
-- account types: 2 = header, 0/1 = dimension rows, 5 = total.
SET NOCOUNT ON

IF OBJECT_ID('tempdb..#data') IS NOT NULL
    DROP TABLE #data

CREATE TABLE #data
(
    Account     INTEGER IDENTITY(1,1),
    AccountType INTEGER
)

BEGIN -- TEST DATA
    DECLARE @MaxDataRows INTEGER = 50 -- Change here to check performance
    DECLARE @RowIndex    INTEGER = 0
    DECLARE @Type        INTEGER

    WHILE (@RowIndex < @MaxDataRows)
    BEGIN
        -- Position inside each block of ten rows decides the type:
        -- 0-2 -> header (2), 3-7 -> dimension row (0 or 1), 8-9 -> total (5).
        SET @Type = CASE
                        WHEN @RowIndex % 10 < 3  THEN 2
                        WHEN @RowIndex % 10 >= 8 THEN 5
                        -- Dimension rows: 0 in the first half of the data, 1 in the second half.
                        WHEN @RowIndex < @MaxDataRows / 2.0 THEN 0
                        ELSE 1
                    END

        -- Account is an IDENTITY column, so only AccountType is supplied.
        INSERT INTO #data (AccountType) VALUES (@Type)

        SET @RowIndex = @RowIndex + 1
    END
END -- TEST DATA END



-- Returns Account, AccountType and Dim for the rows of #data.
--   * Type 0/1 rows use their own type as Dim.
--   * Type 2 (header) islands take the type of the island that starts right after them.
--   * Type 5 (total) islands take the type of the island that ends right before them.
--
-- The original final step joined #data to the islands with
-- "D.Account BETWEEN I.StartRow AND I.EndRow", which the question reports as very slow.
-- Every row already carries its island key (AccountType, GroupId), so the rows are joined
-- back on that key instead; an equality join returns the same rows and gives the optimizer
-- the option of a hash or merge join.
-- Assumes Account values are unique (it is an IDENTITY column, but no constraint enforces it).
--
-- NOTE(review): as in the original, a type 2 island with no following island and a type 5
-- island with no preceding island are dropped by the inner joins, and the neighbouring
-- island's type is used as Dim even when it is not 0 or 1 - confirm this is intended.
;WITH groupIds_cte AS
(
    -- Inside a run of consecutive accounts of one type, ROW_NUMBER and Account both
    -- advance by 1, so their difference is constant and identifies the run.
    SELECT
        Account,
        AccountType,
        ROW_NUMBER() OVER (PARTITION BY AccountType ORDER BY Account) - Account AS GroupId
    FROM #data
),

islandRanges_cte AS
(
    -- One row per island, keeping GroupId so rows can be joined back by equality.
    SELECT
        MIN(Account) AS StartRow,
        MAX(Account) AS EndRow,
        AccountType,
        GroupId
    FROM groupIds_cte
    GROUP BY GroupId, AccountType
),

T3 AS
(
    -- Headers: look at the island that begins immediately after this one.
    SELECT I.StartRow, I.EndRow, I.AccountType, I.GroupId, J.AccountType AS Dim
    FROM islandRanges_cte I
    INNER JOIN islandRanges_cte J
        ON I.EndRow + 1 = J.StartRow
    WHERE I.AccountType = 2

    UNION ALL

    -- Totals: look at the island that ends immediately before this one.
    SELECT I.StartRow, I.EndRow, I.AccountType, I.GroupId, J.AccountType AS Dim
    FROM islandRanges_cte I
    INNER JOIN islandRanges_cte J
        ON I.StartRow - 1 = J.EndRow
    WHERE I.AccountType = 5

    UNION ALL

    -- Dimension rows carry their own type.
    SELECT StartRow, EndRow, AccountType, GroupId, AccountType AS Dim
    FROM islandRanges_cte
    WHERE AccountType = 0 OR AccountType = 1
)

-- Debug aid: replace the final SELECT with "SELECT * FROM T3 ORDER BY StartRow"
-- to inspect the islands and their Dim values.

-- Final result: one row per account that belongs to an island present in T3.
SELECT G.Account, G.AccountType, I.Dim
FROM groupIds_cte G
INNER JOIN T3 I
    ON  I.AccountType = G.AccountType
    AND I.GroupId     = G.GroupId
ORDER BY G.Account

编辑:补充一些计时测试

-- Creates the temp tables used by the timing test.
-- #times and #batch are kept across runs (created only when missing);
-- #data and #islands are dropped and rebuilt on every run.
SET NOCOUNT ON

-- Fix: the guard must test #times itself. The original tested #data, so #times was
-- skipped whenever #data already existed and re-created (failing) when it did not.
IF OBJECT_ID('tempdb..#times') IS NULL
CREATE TABLE #times
(
RecId INTEGER IDENTITY(1,1),
Batch INTEGER,
Method NVARCHAR(255),
MethodDescription NVARCHAR(255),
RunTime INTEGER -- filled from DATEDIFF(MS, ...) by the timing statements, i.e. milliseconds
)

IF OBJECT_ID('tempdb..#batch') IS NULL
CREATE TABLE #batch 
(
Batch INTEGER IDENTITY(1,1),
[Bit] BIT
)

-- Each run adds one row; the new IDENTITY value becomes the current batch number
-- that the timing inserts read back with MAX(Batch).
INSERT INTO #batch ([Bit]) VALUES (0)

IF OBJECT_ID('tempdb..#data') IS NOT NULL
    DROP TABLE #data

CREATE TABLE #data
(
Account INTEGER
)

CREATE NONCLUSTERED INDEX data_account_index ON #data (Account)

IF OBJECT_ID('tempdb..#islands') IS NOT NULL
    DROP TABLE #islands

CREATE TABLE #islands
(
AccountFrom INTEGER,
AccountTo INTEGER,
Dim INTEGER
)

CREATE NONCLUSTERED INDEX islands_from_index ON #islands (AccountFrom, AccountTo, Dim)

BEGIN -- TEST DATA
    -- Generate up to 100000 sequence numbers by cross joining a system table with itself.
    INSERT INTO #data (Account)
    SELECT TOP 100000
        ROW_NUMBER() OVER (ORDER BY v1.number) AS N
    FROM master..spt_values v1
    CROSS JOIN master..spt_values v2

    -- Cut the accounts into 10 tiles; each tile becomes one island whose Dim is the tile number.
    ;WITH tiled AS
    (
        SELECT
            Account,
            NTILE(10) OVER (ORDER BY Account) AS Grp
        FROM #data
    )
    INSERT INTO #islands (AccountFrom, AccountTo, Dim)
    SELECT
        MIN(Account) AS Start,
        MAX(Account),
        Grp
    FROM tiled
    GROUP BY Grp
    ORDER BY Start
END -- TEST DATA END

--SELECT * FROM #data
--SELECT * FROM #islands

--PRINT CONVERT(varchar(20),DATEDIFF(MS,@RunDate,GETDATE()))+' ms Sub Query'
-- Runs four ways of attaching each island's Dim to the accounts and records the
-- elapsed milliseconds of each one in #times under the current batch number.
DECLARE @RunDate datetime = GETDATE()

-- Method 1: correlated scalar subquery in the SELECT list.
SELECT
    acct.Account,
    (
        SELECT isl.Dim
        FROM #islands isl
        WHERE acct.Account BETWEEN isl.AccountFrom AND isl.AccountTo
    ) AS Dim
FROM #data acct

INSERT INTO #times (Batch, Method, MethodDescription, RunTime)
VALUES ((SELECT MAX(Batch) FROM #batch), 'subquery', '', DATEDIFF(MS, @RunDate, GETDATE()))
SET @RunDate = GETDATE()

-- Method 2: CROSS APPLY against the matching island.
SELECT
    acct.Account,
    hit.Dim
FROM #data acct
CROSS APPLY
(
    SELECT isl.Dim
    FROM #islands isl
    WHERE acct.Account BETWEEN isl.AccountFrom AND isl.AccountTo
) hit

INSERT INTO #times (Batch, Method, MethodDescription, RunTime)
VALUES ((SELECT MAX(Batch) FROM #batch), 'crossapply', '', DATEDIFF(MS, @RunDate, GETDATE()))
SET @RunDate = GETDATE()

-- Method 3: plain range join.
SELECT
    acct.Account,
    isl.Dim
FROM #data acct
INNER JOIN #islands isl
    ON acct.Account BETWEEN isl.AccountFrom AND isl.AccountTo

INSERT INTO #times (Batch, Method, MethodDescription, RunTime)
VALUES ((SELECT MAX(Batch) FROM #batch), 'join', '', DATEDIFF(MS, @RunDate, GETDATE()))
SET @RunDate = GETDATE()

-- Method 4: recursive CTE that expands every island into its rows without touching #data.
-- The anchor emits both end points of each island; the recursive step walks upward from an
-- emitted value while the next value stays strictly between the end points.
;WITH island_rows AS
(
    SELECT edge.Account, isl.AccountFrom, isl.AccountTo, isl.Dim, 1 AS Counting
    FROM #islands isl
    CROSS APPLY (VALUES (isl.AccountFrom), (isl.AccountTo)) edge (Account)

    UNION ALL

    SELECT Account + 1, AccountFrom, AccountTo, Dim, Counting + 1
    FROM island_rows
    WHERE AccountFrom < (Account + 1) AND (Account + 1) < AccountTo
)
SELECT Account, Dim, Counting
FROM island_rows
OPTION (MAXRECURSION 32767) -- recursion depth grows with island length; 32767 is the cap set here

INSERT INTO #times (Batch, Method, MethodDescription, RunTime)
VALUES ((SELECT MAX(Batch) FROM #batch), 'recursivecte', '', DATEDIFF(MS, @RunDate, GETDATE()))

您可以从 #times 表中进行选择以查看运行时间 :)

4

1 回答 1

0

我认为您需要的是连接(JOIN),只是连接条件使用不等式而不是等式:

-- Range join: pair each tally row with the island whose [from, to] interval contains its id.
SELECT
    tt.id,
    tt.dim1,
    it.dim2
FROM TallyTable AS tt
INNER JOIN IslandsTable AS it
    ON  tt.id >= it."from"
    AND tt.id <= it."to"

这适用于您在问题中提供的数据。

这是另一个可能有效的想法。这是查询:

-- For every account, fetch the type of the nearest later account that is neither a
-- header (2) nor a total (5).
-- Fix: TOP 1 needs an ORDER BY; without it, which of the later rows is returned is not
-- guaranteed, so "next" could be any later non-2/5 row.
-- NOTE(review): this only looks forward, which matches the rule described for type 2 rows;
-- type 5 rows and the 0/1 rows themselves would need different handling.
select d.*,
       (select top 1 d2.AccountType
        from #data d2
        where d2.Account > d.Account
          and d2.AccountType not in (2, 5)
        order by d2.Account
       ) nextAccountType
from #data d 
order by d.account;

我刚刚在 50,000 行上运行了这个,这个版本在我的系统上花了 17 秒。将表更改为:

-- Variant of #data with Account declared as the primary key.
CREATE TABLE #data
(
    Account     INT IDENTITY(1, 1) PRIMARY KEY,
    AccountType INT
);

出乎我意料的是,这反而使它变慢到大约 1 分 33 秒。也许这两种方法之一会对您有所帮助。

于 2013-05-22T13:42:17.753 回答