我想为每个客户获取最近三次登录的日期,并找出最近一次登录 (login3) 与倒数第三次登录 (login1) 之间相隔超过 4 天的客户。
activity 表包含以下列:
- user_id
- login_date,为 DATETIME 格式,但时间部分始终为 00:00:00
- (以及其他一些与本问题无关的字段)
我尝试了几个查询,但没有一个能正常工作。
我想为每个客户获取最近三次登录的日期,并找出最近一次登录 (login3) 与倒数第三次登录 (login1) 之间相隔超过 4 天的客户。
activity 表包含:
我尝试了几个查询,但没有一个能正常工作。
这是一种使用数组的解决方案,适用于 PostgreSQL 8.3 及更高版本。
生成测试数据。修改 generate_series() 的第二个参数即可添加更多活动记录:
-- Test fixture: an activity table filled with 1000 random logins.
CREATE TABLE activity (
    id         serial PRIMARY KEY,
    user_id    integer,
    login_date timestamp
);

-- user_id is a random integer in 0..10; login_date is midnight of a random day
-- within 300 days after 2012-01-01. Rows are inserted in login_date order.
-- Raise the upper bound of generate_series() to generate more rows.
INSERT INTO activity (user_id, login_date)
SELECT
    gen.user_id,
    gen.login_date
FROM (
    SELECT
        round(random() * 10)::integer AS user_id,
        ('2012-01-01'::date + round(random() * 300) * '1 day'::interval) AS login_date
    FROM generate_series(1, 1000)
) AS gen
ORDER BY gen.login_date;

-- Inspect the generated rows.
SELECT id, user_id, login_date
FROM activity;
查询出想要的数据:
-- Show the last three login dates per user.
-- Each user's logins are collected newest-first into an array, so elements
-- 1..3 are the three most recent logins.
-- NOTE(review): the array order comes solely from the ORDER BY of the inner
-- subquery feeding array_agg().
SELECT
    user_id,
    logins[1] AS login1,
    logins[2] AS login2,
    logins[3] AS login3
FROM (
    SELECT
        user_id,
        array_agg(login_date) AS logins
    FROM (
        SELECT user_id, login_date
        FROM activity
        ORDER BY user_id, login_date DESC
    ) AS newest_first
    GROUP BY user_id
) AS per_user;
-- Keep only the users who have not been visiting frequently enough:
-- the newest login (login1) is more than 4 days after the third-newest one.
-- COALESCE falls back to the second-newest login, then to the newest itself,
-- when the array holds fewer than three entries.
-- NOTE(review): the array order comes solely from the ORDER BY of the inner
-- subquery feeding array_agg().
SELECT
    user_id,
    logins[1] AS login1,
    logins[2] AS login2,
    logins[3] AS login3,
    (logins[1] - COALESCE(logins[3], logins[2], logins[1]))::interval AS diff
FROM (
    SELECT
        user_id,
        array_agg(login_date) AS logins
    FROM (
        SELECT user_id, login_date
        FROM activity
        ORDER BY user_id, login_date DESC
    ) AS newest_first
    GROUP BY user_id
) AS per_user
WHERE logins[1] - COALESCE(logins[3], logins[2], logins[1]) > '4 days'::interval;
我使用并简化了@Joshua 提供的设置:
-- Session-local test fixture: 1000 random logins.
CREATE TEMP TABLE activity (
    id         serial PRIMARY KEY,
    user_id    integer,
    login_date timestamp
);

-- user_id is a random integer in 0..10; login_date is a random instant within
-- 365 days after 2012-01-01 00:00. Rows are inserted in timestamp order.
INSERT INTO activity (user_id, login_date)
SELECT
    sample.user_id,
    sample.ts
FROM (
    SELECT
        round(random() * 10)::int AS user_id,
        ('2012-01-01 0:0'::timestamp + random() * interval '365 days') AS ts
    FROM generate_series(1, 1000)
) AS sample
ORDER BY sample.ts;
您可以使用自 PostgreSQL 8.4 起可用的窗口函数:
-- Users whose newest login (login1) is more than 4 days after their
-- third-newest login (login3), computed with window functions.
--
-- Exactly one row per user is selected with row_number() = 1. Filtering on
-- "login_date = first_value(login_date)" instead would return a user several
-- times whenever two or more of their rows share the newest login_date.
SELECT user_id, login1, login3, (login1 - login3) AS time_span
FROM  (
   SELECT user_id
        , login_date AS login1            -- on the rn = 1 row this is the newest login
          -- third-newest login; falls back to the second-newest when the user
          -- has only two logins, and stays NULL when there is only one
        , COALESCE(lead(login_date, 2) OVER w
                 , lead(login_date)    OVER w) AS login3
        , row_number() OVER w AS rn
   FROM   activity
   WINDOW w AS (PARTITION BY user_id ORDER BY login_date DESC)
   ) x
WHERE  rn = 1
AND    (login1 - login3) > interval '4d';  -- a NULL login3 never passes this filter
在我看来(IMO)这种写法更易读,但在快速测试中,@Joshua 的查询快了约 30%。
除此之外,如果您的时间戳的时间部分始终是 00:00:00,您可能需要考虑使用 date 列而不是 timestamp 列。
为了完整起见,这里给出朴素(naive)版本(查询计划显示 3 个 CTE 各对应一个独立的子计划,这很糟糕;用递归 CTE 应该也可以实现 ;-)):
-- Naive version: three chained CTEs pick each user's latest (l3),
-- second-latest (l2) and third-latest (l1) login, then keep users whose
-- latest login is more than 4 days after the third-latest one.
--
-- All comparisons on login_date are strict, so:
--   * a user needs three distinct login_date values to appear in the result;
--   * rows that share the same login_date all qualify for the same CTE and
--     multiply the joined output.
WITH l3 AS (
    -- latest login: no later login exists for the same user
    SELECT a3.id, a3.user_id, a3.login_date
    FROM activity a3
    WHERE NOT EXISTS ( SELECT *
        FROM activity nx
        WHERE nx.user_id = a3.user_id
        AND nx.login_date > a3.login_date
        )
    )
, l2 AS (
    -- second-latest login: earlier than l3 with nothing in between
    SELECT a2.id, a2.user_id, a2.login_date
    FROM activity a2
    JOIN l3 ON l3.user_id = a2.user_id AND l3.login_date > a2.login_date
    WHERE NOT EXISTS ( SELECT *
        FROM activity nx
        WHERE nx.user_id = a2.user_id
        AND nx.login_date > a2.login_date
        AND nx.login_date < l3.login_date
        )
    )
, l1 AS (
    -- third-latest login: earlier than l2 with nothing in between
    SELECT a1.id, a1.user_id, a1.login_date
    FROM activity a1
    JOIN l2 ON l2.user_id = a1.user_id AND l2.login_date > a1.login_date
    WHERE NOT EXISTS ( SELECT *
        FROM activity nx
        WHERE nx.user_id = a1.user_id
        AND nx.login_date > a1.login_date
        AND nx.login_date < l2.login_date
        )
    )
SELECT l1.user_id
    ,l1.id AS ii1, l1.login_date AS d1
    ,l2.id AS ii2, l2.login_date AS d2
    ,l3.id AS ii3, l3.login_date AS d3
FROM l1
JOIN l2 ON l2.user_id = l1.user_id
JOIN l3 ON l3.user_id = l1.user_id
WHERE l3.login_date - l1.login_date > '4 days'::INTERVAL
    ;