There have been several questions about rolling sums in R, but none fully address the complexity of the problem. I am looking for the fastest way to take a rolling sum on a dataset with the following characteristics:

• At least 5,000 rows

• The data span at least 15 years

• The number of groups is a substantial fraction of the total number of rows

• The time series data are incomplete for each group

• There can be multiple observations for any group on any given day

A comprehensive solution to the rolling-sum problem must handle each of these characteristics robustly. I have benchmarked three different solutions; all of them calculate a 7-day rolling sum and produce identical results. As far as I am aware, the plyr::ddply approach is the fastest way to take a rolling sum, so I am interested both in optimizing this strategy and in any code that can complete the task faster. My R code and benchmarking results follow:
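
To make the window definition concrete, here is a small toy example (the values are purely illustrative and are not part of the benchmark): for any row, the rolling sum counts every observation of the same group whose date falls on that row's date or on one of the 6 preceding days, so same-day duplicates are counted together.

library(data.table)
toy=data.table(date=as.Date(c("2000-01-01","2000-01-03","2000-01-03","2000-01-10")), n=1)
# Expected rolling_7_day_sum: 1, 3, 3, 1
#  - each 2000-01-03 row counts itself, the other same-day row and 2000-01-01
#  - 2000-01-10 is 7 days after 2000-01-03, so it only counts itself
toy[,rolling_7_day_sum:=sapply(seq_along(date), function(i) sum(n[as.numeric(date[i]-date) %in% 0:6]))]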

library(data.table)
library(dplyr)
library(lubridate)
library(zoo)
library(plyr)
library(RcppRoll)
library(microbenchmark)

# Creating dataset: 5,000 rows, dates spanning 17 years, 250 groups ("type"),
# possible repeat group/date combinations, and n = 1 as the value to be summed
set.seed(8)
df_250_groups=data.table(id=seq.int(5000),
                         date=sample(seq(as.Date('2000/01/01'), as.Date('2017/01/01'), by="day"),5000,replace=T),
                         type=sample(seq.int(250),5000,replace=T),
                         n=1)

# Using plyr::ddply
plyr_ddply=function(data){
  if(dim(data)[1] != 0){
    data$rolling_7_day_sum = NA
    for (i in 1:(dim(data)[1])) {
      data$rolling_7_day_sum[i]=sum(data[which(as.numeric(data$date[i]-data$date) %in% 0:6),]$n) # sum n over rows whose date is 0-6 days before row i's date (trailing 7-day window)
    }
  }
  return(data)
}
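
Purely for reference, the same per-group window logic can be written without the explicit for loop. The vapply variant below is only a readability sketch, under the assumption that it returns identical results; it is not one of the benchmarked solutions.

# Equivalent per-group computation using vapply instead of a for loop (not benchmarked)
plyr_ddply_vapply=function(data){
  if(nrow(data) != 0){
    data$rolling_7_day_sum=vapply(seq_len(nrow(data)), function(i){
      sum(data$n[as.numeric(data$date[i]-data$date) %in% 0:6]) # same trailing 7-day window
    }, numeric(1))
  }
  return(data)
}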

# Using RcppRoll::roll_sumr
RcppRoll_roll_sumr=function(df_250_groups) {
  # Build a complete daily calendar for every type (6,217 days: 2000-01-01 minus 6 days through 2017-01-01)
  data=list()
  for(i in 1:length(unique(df_250_groups$type))){
    data[[i]]=list(type=rep(i,6217),date=seq.Date(ymd("2000-01-01")-days(6),ymd("2017-01-01"),by="1 day"))
  }
  data1=rbindlist(data)
  aggregated=df_250_groups[,.N,by=c("type","date")] # count observations per type/day
  df2=merge(data1,aggregated,by=c("type","date"),all.x=T) # left join the counts onto the full calendar
  df2[is.na(N),N:=0] # days with no observations contribute 0
  setorder(df2,date)
  df2[,rolling_7_day_sum:=roll_sumr(N,7,fill=NA),by=type] # 7-day trailing rolling sum within each type
  df3rcp=merge(df_250_groups,df2,by=c("type","date")) # attach the rolling sums back to the original rows
  return(df3rcp)
}

# Using zoo::rollsumr (same steps as the RcppRoll version; only the rolling-sum call differs)
zoo_rollsumr=function(df_250_groups) {
  data=list()
  for(i in 1:length(unique(df_250_groups$type))){
    data[[i]]=list(type=rep(i,6217),date=seq.Date(ymd("2000-01-01")-days(6),ymd("2017-01-01"),by="1 day"))
  }
  data1=rbindlist(data)
  aggregated=df_250_groups[,.N,by=c("type","date")]
  df2=merge(data1,aggregated,by=c("type","date"),all.x=T)
  df2[is.na(N),N:=0]
  setorder(df2,date)
  df2[,rolling_7_day_sum:=rollsumr(N,7,fill=NA),by=type] # 7-day trailing rolling sum within each type
  df3zoo=merge(df_250_groups,df2,by=c("type","date"))
  return(df3zoo)
}

mb_250_groups=microbenchmark(
  plyr_ddply_df=ddply(df_250_groups, .(type), plyr_ddply, .id = F),
  RcppRoll_roll_sumr(df_250_groups),
  zoo_rollsumr(df_250_groups),
  times = 25L,
  unit = "s"
)
print(mb_250_groups)
Unit: seconds
                              expr      min       lq     mean   median       uq      max neval
                     plyr_ddply_df 1.258333 1.262470 1.279760 1.266951 1.282107 1.352914    25
 RcppRoll_roll_sumr(df_250_groups) 1.628959 1.660497 1.714138 1.709180 1.756196 1.866871    25
       zoo_rollsumr(df_250_groups) 2.155310 2.193149 2.264942 2.247413 2.269310 2.702394    25

# Executing functions to verify that these rolling functions work correctly
plyr_ddply_df=ddply(df_250_groups, .(type), plyr_ddply, .id = F)
RcppRoll_roll_sumr_df=RcppRoll_roll_sumr(df_250_groups)
zoo_rollsumr_df=zoo_rollsumr(df_250_groups)
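
One way to confirm that the three outputs agree is sketched below (a check of my own, relying on id being a unique row identifier): order each result by id and compare the rolling-sum columns element-wise.

# Compare the rolling sums of the three results after ordering by id
plyr_sums=plyr_ddply_df[order(plyr_ddply_df$id),"rolling_7_day_sum"]
rcpp_sums=RcppRoll_roll_sumr_df[order(id)]$rolling_7_day_sum
zoo_sums=zoo_rollsumr_df[order(id)]$rolling_7_day_sum
all.equal(plyr_sums,rcpp_sums) # should be TRUE if the plyr and RcppRoll results match
all.equal(plyr_sums,zoo_sums)  # should be TRUE if the plyr and zoo results match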