sql - 借助日期字段应用滑动窗口功能

Question

我正在尝试在一天内计算不同的 ID，这很简单，但是如何从窗口中的当前日期开始计算它们 7 天，在当前窗口中返回 30 天。

我从当前行向后滑动 7/30 行，但我认为它可能会多于/少于 7/30 天并不准确，因为我认为它只是按行数滑动，所以你有什么想法处理这个？

--mockup data SnowSQL
with test  (date,id,a,b,c,d) as(
select $1 as date, $2 as id, $3 as a, $4 as b, $5 as c, $6 as d
from values 
('2019-07-15','x_1','ps','e','US','NA'),
('2019-07-15','x_2','ps','e','US','NA'),
('2019-07-15','x_2','ps','e','CA','NA'),
('2019-07-16','x_2','ps','e','CA','NA'),
('2019-07-16','x_3','c','xb','CH','AS'),
('2019-07-17','x_4','ps','e','US','NA'),
('2019-07-17','x_5','c','ps4','CH','AS'),
('2019-07-17','x_6','c','ps4','CH','AS'),
('2019-07-17','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-19','x_8','c','ps','CH','AS'),
('2019-07-20','x_8','c','ps','CH','AS'),
('2019-07-21','x_8','c','ps','CH','AS'),
('2019-07-22','x_8','c','ps','CH','AS'),
('2019-07-29','x_8','c','ps','CH','AS'),
('2019-08-01','x_8','c','ps','CH','AS'),
('2019-08-02','x_9','c','ps','CH','AS'),
('2019-08-03','y_1','c','ps','CH','AS'),
('2019-08-04','y_8','c','ps','CH','AS'),
('2019-08-05','z_8','c','ps','CH','AS'),
('2019-08-06','a_8','c','ps','CH','AS'),
('2019-08-07','b_8','c','ps','CH','AS'),
('2019-08-07','c_8','c','ns','CH','AS'),
('2019-08-07','e_8','ps','e','US','NA'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-10','gx_8','c','ps','CH','AS'),
('2019-08-11','v_8','c','ps','CH','AS') )
--subquery
select 1 as part,
(
  select sum(x)
  from (
select count(distinct id) as x
from test
where
 date = '2019-07-16'
group by a,b,c,d) ) as dau,
(
  select sum(x)
  from (
select count(distinct id) as x
from test
where
 date between '2019-07-16'::date-interval '7 days' and '2019-07-16'
//    <= '2019-07-15'
//and date >= '2019-07-15'::date-interval '7 days'
group by a,b,c,d)) as w,
(select sum(x)
  from (
select count(distinct id) as x
from test
where
 date <= '2019-07-16'
and date >= '2019-07-16'::date-interval '30 days'
group by a,b,c,d)) as m
union
--window function
select 2,sum(dau),sum(w),sum(m)
from(
select 
date,
a,
b,
c,
d,
count(distinct id) over (Partition by date,a,b,c,d Order by date)as dau,
count(distinct id) over (Partition by a,b,c,d Order by date rows between 7 preceding  and current row) as w,
count(distinct id) over (Partition by a,b,c,d Order by date rows between 30 preceding  and current row) as m
from test
group by
date,
a,
b,
c,
d,
id)
where date='2019-07-16'
group by date
;

第一部分是在子查询的时间范围内进行计数

第二部分是使用滑动功能进行计数

预期结果：两者都应该在相同的时间范围内获得相同的计数。

score 0 · Accepted Answer

如果我正确理解了这个问题，您可以像这样在窗口函数中使用范围参数（为简单起见，我将您的表值转储到临时表中）：

select distinct [date], id
      from #test
order by [date] desc

select [date], 
        count(*) over (order by date desc rows between current row and 7 following ) count_7_day,
        count(*) over (order by date desc rows between current row and 30 following ) count_30_day
from (select distinct [date], id
      from #test )x
order by [date] desc

希望这有帮助？

score 0 · Accepted Answer

带有表格中的日期

create table  test as(
select $1 as date, $2 as id, $3 as a, $4 as b, $5 as c, $6 as d
from values 
('2019-07-15','x_1','ps','e','US','NA'),
('2019-07-15','x_2','ps','e','US','NA'),
('2019-07-15','x_2','ps','e','CA','NA'),
('2019-07-16','x_2','ps','e','CA','NA'),
('2019-07-16','x_3','c','xb','CH','AS'),
('2019-07-17','x_4','ps','e','US','NA'),
('2019-07-17','x_5','c','ps4','CH','AS'),
('2019-07-17','x_6','c','ps4','CH','AS'),
('2019-07-17','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-18','x_7','c','ns','CH','AS'),
('2019-07-19','x_8','c','ps','CH','AS'),
('2019-07-20','x_8','c','ps','CH','AS'),
('2019-07-21','x_8','c','ps','CH','AS'),
('2019-07-22','x_8','c','ps','CH','AS'),
('2019-07-29','x_8','c','ps','CH','AS'),
('2019-08-01','x_8','c','ps','CH','AS'),
('2019-08-02','x_9','c','ps','CH','AS'),
('2019-08-03','y_1','c','ps','CH','AS'),
('2019-08-04','y_8','c','ps','CH','AS'),
('2019-08-05','z_8','c','ps','CH','AS'),
('2019-08-06','a_8','c','ps','CH','AS'),
('2019-08-07','b_8','c','ps','CH','AS'),
('2019-08-07','c_8','c','ns','CH','AS'),
('2019-08-07','e_8','ps','e','US','NA'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-08','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-09','f_8','c','xb','CH','AS'),
('2019-08-10','gx_8','c','ps','CH','AS'),
('2019-08-11','v_8','c','ps','CH','AS') );

我过去这样做的方式是，我想将不同的日期绘制为行，因此将结果显示为不同的行，以允许按天列的颜色来解决这个问题：

with distinct_data as (
    select distinct date, id
    from test
), days as ( -- set of days we want to range over
    select column1 as days from (values (1),(7),(30))
), windows_ends as ( -- last 60 days of date
    select dateadd('day',-SEQ8(),current_date) as win_date_end
    from table(generator(rowcount => 60))
), windows as (
    select d.days
        ,w.win_date_end
        ,dateadd('day',-d.days, w.win_date_end) as win_date_start
    from days as d
    join windows_ends as w
)
select w.days
    ,w.win_date_start
    ,w.win_date_end
    ,count(distinct(d.id)) as c_id
from distinct_data as d
join windows as w on d.date > w.win_date_Start and d.date <= w.win_date_end
group by 1,2,3
order by 3,1;

对于我的结果，我删除了 win_date_start 在我的数据开始之前的行，所以我不会得到数据集开始后 2 天开始的 30 天。

以上允许动态天数（我发现这非常有用），但在具有三个固定结果的情况下，我只是添加了一个支点，因此..

with distinct_data as (
    select distinct date, id
    from test
), days as ( -- set of days we want to range over
    select column1 as days from (values (1),(7),(30))
), windows_ends as ( -- last 60 days of date
    select dateadd('day',-SEQ8(),current_date) as win_date_end
    from table(generator(rowcount => 60))
), windows as (
    select d.days
        ,w.win_date_end
        ,dateadd('day',-d.days, w.win_date_end) as win_date_start
    from days as d
    join windows_ends as w
), manyrows as (
  select w.days
      ,w.win_date_end
      ,count(distinct(d.id)) as c_id
  from distinct_data as d
  join windows as w on d.date > w.win_date_Start and d.date <= w.win_date_end
  group by 1,2
)
select win_date_end as date, "1" as d1, "7" as d7, "30" as d30
from manyrows
   pivot(sum(c_id) for days in (1,7,30)) as p
order by win_date_end;

但是您可以看到结果如何对最后一个日期产生拖尾效应......我倾向于过滤掉......

这也需要从现在开始更改 60 过滤器

考虑到您固定的 30 天窗口，我觉得这可以更有效地完成，就像这样

with distinct_data as (
    select distinct date
        ,id
    from test
), distinct_dates as (
    select distinct date
        ,dateadd('day',-30,date) as d30_s
    from distinct_data
), mixed_data as (
select d.date
    ,b.id
    ,datediff('days',b.date, d.date) as d_diff
    ,iff(d_diff < 1, id, NULL) as d1_id
    ,iff(d_diff < 7, id, NULL) as d7_id
    ,iff(d_diff < 30, id, NULL) as d30_id
from distinct_dates as d
join distinct_data as b on b.date >= d.d30_s and b.date <= d.date
)
select date
    ,count(distinct(d1_id)) as d1
    ,count(distinct(d7_id)) as d7
    ,count(distinct(d30_id)) as d30
from mixed_data
group by 1 order by 1;

但这仅给出了当前天数的窗口总和，因此前两种方法为 2019-7-23 给出了 7 天和 30 天，因为后者没有那一天..所以这取决于您需要数据的方式。

score 0 · Accepted Answer

如果我们使用当前行，除非您的案例只有 1 行，否则我们将不会获得例外输出，因此如果该功能可用，我们应该使用按范围滑动。如果不使用日期维度加入您的数据并为滑动窗口填充数据并使用案例语句来获取您的窗口（如果您有多个窗口）。现在，由于我们拥有所需的数据，因此在案例上使用常规窗口函数并对其应用计数。

sql - 借助日期字段应用滑动窗口功能

3 回答 3

Related

Reference