这是一个dplyr
解决方案,它将产生问题中指定的所需结果(14 行)。请注意,它会处理重复的日期条目,例如用户 x 的 2013-01-04。
# define a custom function to be used in the dplyr chain
myfunc <- function(x){
with(x, sapply(event_number, function(y)
sum(items_bought[event_number <= event_number[y] & date[y] - date <= 2])))
}
require(dplyr) #install and load into your library
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(.))) %>%
select(-c(items_bought, event_number))
# date user cum_items_bought_3_days
#1 2013-01-01 x 2
#2 2013-01-02 x 3
#3 2013-01-03 x 3
#4 2013-01-04 x 1
#5 2013-01-04 x 2
#6 2013-01-04 x 4
#7 2013-01-05 x 6
#8 2013-01-06 x 7
#9 2013-01-01 y 1
#10 2013-01-02 y 2
#11 2013-01-03 y 2
#12 2013-01-04 y 6
#13 2013-01-05 y 11
#14 2013-01-06 y 12
在我的回答中,我myfunc
在dplyr
链中使用了自定义函数。这是使用do
操作符 from完成的dplyr
。user
自定义函数按组传递子集 df 。然后它sapply
用于传递每个event_number
并计算 的总和items_bought
。链的最后一行dplyr
取消选择不需要的列。
如果您需要更详细的解释,请告诉我。
OP评论后编辑:
如果您需要更大的灵活性来有条件地总结其他列,您可以调整代码如下。我在这里假设其他列的总结方式应与items_bought
. 如果这不正确,请指定您希望如何总结其他列。
我首先在数据中创建两个带有随机数的附加列(我将dput
在我的答案底部发布一个数据):
set.seed(99) # for reproducibility only
df$newCol1 <- sample(0:10, 14, replace=T)
df$newCol2 <- runif(14)
df
# date user items_bought event_number newCol1 newCol2
#1 2013-01-01 x 2 1 6 0.687800094
#2 2013-01-02 x 1 2 1 0.640190769
#3 2013-01-03 x 0 3 7 0.357885360
#4 2013-01-04 x 0 4 10 0.102584999
#5 2013-01-04 x 1 5 5 0.097790922
#6 2013-01-04 x 2 6 10 0.182886256
#7 2013-01-05 x 3 7 7 0.227903474
#8 2013-01-06 x 1 8 3 0.080524150
#9 2013-01-01 y 1 1 3 0.821618422
#10 2013-01-02 y 1 2 1 0.591113977
#11 2013-01-03 y 0 3 6 0.773389019
#12 2013-01-04 y 5 4 5 0.350085977
#13 2013-01-05 y 6 5 2 0.006061323
#14 2013-01-06 y 1 6 7 0.814506223
接下来,您可以修改myfunc
为采用 2 个参数,而不是 1。第一个参数将像以前一样保留子集的 data.frame(由.
dplyr 链内部和x
的函数定义中表示myfunc
),而第二个参数myfunc
将指定列总结(colname
)。
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
然后,myfunc
如果您想有条件地总结几列,您可以使用多次:
df %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
# date user cum_items_bought_3_days newCol1Sums newCol2Sums
#1 2013-01-01 x 2 6 0.6878001
#2 2013-01-02 x 3 7 1.3279909
#3 2013-01-03 x 3 14 1.6858762
#4 2013-01-04 x 1 18 1.1006611
#5 2013-01-04 x 2 23 1.1984520
#6 2013-01-04 x 4 33 1.3813383
#7 2013-01-05 x 6 39 0.9690510
#8 2013-01-06 x 7 35 0.6916898
#9 2013-01-01 y 1 3 0.8216184
#10 2013-01-02 y 2 4 1.4127324
#11 2013-01-03 y 2 10 2.1861214
#12 2013-01-04 y 6 12 1.7145890
#13 2013-01-05 y 11 13 1.1295363
#14 2013-01-06 y 12 14 1.1706535
现在您创建了和列的条件items_bought
总和。您还可以省略 dplyr 链中的任何总和,或添加更多列进行总和。newCol1
newCol2
OP评论后编辑#2:
要计算每个用户购买的不同(唯一)项目的累积总和,您可以定义第二个自定义函数myfunc2
并在 dplyr 链中使用它。此功能也很灵活,myfunc
因此您可以定义要应用该功能的列。
代码将是:
myfunc <- function(x, colname){
with(x, sapply(event_number, function(y)
sum(x[event_number <= event_number[y] & date[y] - date <= 2, colname])))
}
myfunc2 <- function(x, colname){
cumsum(sapply(seq_along(x[[colname]]), function(y)
ifelse(!y == 1 & x[y, colname] %in% x[1:(y-1), colname], 0, 1)))
}
require(dplyr) #install and load into your library
dd %>%
mutate(date = as.Date(as.character(date))) %>%
group_by(user) %>%
do(data.frame(., cum_items_bought_3_days = myfunc(., "items_bought"),
newCol1Sums = myfunc(., "newCol1"),
newCol2Sums = myfunc(., "newCol2"),
distinct_items_bought = myfunc2(., "items_bought"))) %>%
select(-c(items_bought, event_number, newCol1, newCol2))
这是我使用的数据:
dput(df)
structure(list(date = structure(c(1L, 2L, 3L, 4L, 4L, 4L, 5L,
6L, 1L, 2L, 3L, 4L, 5L, 6L), .Label = c("2013-01-01", "2013-01-02",
"2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"), class = "factor"),
user = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c(" x", " y"), class = "factor"),
items_bought = c(2L, 1L, 0L, 0L, 1L, 2L, 3L, 1L, 1L, 1L,
0L, 5L, 6L, 1L), event_number = c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L), newCol1 = c(6L, 1L, 7L,
10L, 5L, 10L, 7L, 3L, 3L, 1L, 6L, 5L, 2L, 7L), newCol2 = c(0.687800094485283,
0.640190769452602, 0.357885359786451, 0.10258499882184, 0.0977909218054265,
0.182886255905032, 0.227903473889455, 0.0805241498164833,
0.821618422167376, 0.591113976901397, 0.773389018839225,
0.350085976999253, 0.00606132275424898, 0.814506222726777
)), .Names = c("date", "user", "items_bought", "event_number",
"newCol1", "newCol2"), row.names = c(NA, -14L), class = "data.frame")