There have been several questions about rolling sums in R, but none of them fully address the complexity of the problem. I am looking for the fastest way to take a rolling sum on a dataset with the following characteristics:
• At least 5,000 rows
• The data span at least 15 years
• The number of groups is a substantial fraction of the total number of rows
• The time series data are incomplete for each group
• There can be multiple observations for any group on any given day
A comprehensive solution to the rolling sum problem must handle each of these characteristics robustly. I have benchmarked three different solutions. All of them calculate a 7-day rolling sum, and their results are identical. As far as I am aware, the plyr::ddply approach is the fastest way to take such a rolling sum, so I am interested both in optimizing this strategy and in any code that can complete the task faster. My R code and benchmarking results follow, after a small toy example that pins down the intended computation.
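The target is: for each row, sum n over all rows of the same type whose dates fall within the 7-day window ending on that row's date. A minimal sketch with made-up toy data (just an illustration, not the benchmark dataset):
toy=data.frame(type=1L,
               date=as.Date(c("2000-01-01","2000-01-03","2000-01-03","2000-01-10")),
               n=1)
# Toy illustration only: gaps and same-day duplicates are treated the same way as in the solutions below
toy$rolling_7_day_sum=sapply(toy$date, function(d) sum(toy$n[toy$date >= d - 6 & toy$date <= d]))
toy$rolling_7_day_sum # 1 3 3 1: each duplicated 2000-01-03 row counts both duplicates plus 2000-01-01, while 2000-01-10 falls outside every earlier window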
library(data.table)
library(dplyr)
library(lubridate)
library(zoo)
library(plyr)
library(RcppRoll)
library(microbenchmark)
# Creating dataset: 5,000 rows, 250 groups, dates spanning 2000-2017 with
# gaps and with possible repeat observations for a group on a given day
set.seed(8)
df_250_groups=data.table(id=seq.int(5000),
                         date=sample(seq(as.Date('2000/01/01'), as.Date('2017/01/01'), by="day"),5000,replace=T),
                         type=sample(seq.int(250),5000,replace=T),
                         n=1)
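A few quick sanity checks (not part of the benchmark, just my way of confirming the simulated data match the characteristics above):
nrow(df_250_groups)                          # 5,000 rows
range(df_250_groups$date)                    # dates span 2000 through 2017
uniqueN(df_250_groups$type)                  # number of distinct groups (250 types were sampled)
df_250_groups[,.N,by=c("type","date")][N>1]  # type/date pairs with more than one observation on the same day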
# Using plyr::ddply
plyr_ddply=function(data){
  if(dim(data)[1] != 0){
    data$rolling_7_day_sum = NA
    # For each row, scan the whole group for observations dated 0-6 days
    # earlier (or the same day) and sum their n values: O(rows^2) per group
    for (i in 1:(dim(data)[1])) {
      data$rolling_7_day_sum[i]=sum(data[which(as.numeric(data$date[i]-data$date) %in% 0:6),]$n) # compute rolling sum in for loop
    }
  }
  return(data)
}
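ddply applies this helper to one type at a time; a direct call on a single group (here type 1, purely illustrative and not part of the benchmark) shows the column it adds:
head(plyr_ddply(as.data.frame(df_250_groups[type==1])))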
# Using RcppRoll::roll_sumr
RcppRoll_roll_sumr=function(df_250_groups) {
  # Build a complete daily date grid (6217 dates: 2000-01-01 minus 6 days
  # through 2017-01-01) for every type so the rolling window never skips days
  data=list()
  for(i in 1:length(unique(df_250_groups$type))){
    data[[i]]=list(type=rep(i,6217),
                   date=seq.Date(ymd("2000-01-01")-days(6),ymd("2017-01-01"),by="1 day"))
  }
  data1=rbindlist(data)
  # Count observations per type/date, join the counts onto the grid, and fill gaps with 0
  aggregated=df_250_groups[,.N,by=c("type","date")]
  df2=merge(data1,aggregated,by=c("type","date"),all.x=T)
  df2[is.na(N),N:=0]
  setorder(df2,date)
  df2[,rolling_7_day_sum:=roll_sumr(N,7,fill=NA),by=type] # right-aligned 7-day rolling sum within each type
  # Attach the rolling sums back onto the original rows
  df3rcp=merge(df_250_groups,df2,by=c("type","date"))
  return(df3rcp)
}
# Using zoo::rollsumr
zoo_rollsumr=function(df_250_groups) {
  # Same date-grid approach as above, with zoo::rollsumr doing the rolling sum
  data=list()
  for(i in 1:length(unique(df_250_groups$type))){
    data[[i]]=list(type=rep(i,6217),
                   date=seq.Date(ymd("2000-01-01")-days(6),ymd("2017-01-01"),by="1 day"))
  }
  data1=rbindlist(data)
  aggregated=df_250_groups[,.N,by=c("type","date")]
  df2=merge(data1,aggregated,by=c("type","date"),all.x=T)
  df2[is.na(N),N:=0]
  setorder(df2,date)
  df2[,rolling_7_day_sum:=rollsumr(N,7,fill=NA),by=type] # right-aligned 7-day rolling sum within each type
  df3zoo=merge(df_250_groups,df2,by=c("type","date"))
  return(df3zoo)
}
mb_250_groups=microbenchmark(
  plyr_ddply_df=ddply(df_250_groups, .(type), plyr_ddply, .id = F),
  RcppRoll_roll_sumr(df_250_groups),
  zoo_rollsumr(df_250_groups),
  times = 25L,
  unit = "s"
)
print(mb_250_groups)
Unit: seconds
                              expr      min       lq     mean   median       uq      max neval
                     plyr_ddply_df 1.258333 1.262470 1.279760 1.266951 1.282107 1.352914    25
 RcppRoll_roll_sumr(df_250_groups) 1.628959 1.660497 1.714138 1.709180 1.756196 1.866871    25
       zoo_rollsumr(df_250_groups) 2.155310 2.193149 2.264942 2.247413 2.269310 2.702394    25
# Executing the functions to verify that each rolling-sum implementation works correctly and produces identical results
plyr_ddply_df=ddply(df_250_groups, .(type), plyr_ddply, .id = F)
RcppRoll_roll_sumr_df=RcppRoll_roll_sumr(df_250_groups)
zoo_rollsumr_df=zoo_rollsumr(df_250_groups)
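To back the claim that the three solutions return identical rolling sums, a quick comparison after aligning row order (ddply and the merges order the rows differently):
# Sort all three results by id, then compare the rolling-sum columns; both checks should return TRUE
plyr_sorted=plyr_ddply_df[order(plyr_ddply_df$id),]
rcpp_sorted=RcppRoll_roll_sumr_df[order(id)]
zoo_sorted=zoo_rollsumr_df[order(id)]
all.equal(plyr_sorted$rolling_7_day_sum, rcpp_sorted$rolling_7_day_sum)
all.equal(plyr_sorted$rolling_7_day_sum, zoo_sorted$rolling_7_day_sum)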