2018 年 12 月更新
最近在 data.table 中有效地实现了自适应滚动功能 - 更多信息在?froll手册中。此外,已经确定了使用基础 R 的有效替代解决方案(fastama
如下)。不幸的是,Kevin Ushey 的回答没有解决这个问题,因此它不包含在基准测试中。基准的规模已经增加,因为比较微秒毫无意义。
set.seed(108)
x = rnorm(1e6)
width = rep(seq(from = 100, to = 500, by = 5), length.out=length(x))
microbenchmark(
zoo=rollapplyr(x, width = width, FUN=mean, fill=NA),
mapply=base_mapply(x, width=width, FUN=mean, na.rm=T),
wmapply=wmapply(x, width=width, FUN=mean, na.rm=T),
ama=ama(x, width, na.rm=T),
fastama=fastama(x, width),
frollmean=frollmean(x, width, na.rm=T, adaptive=TRUE),
frollmean_exact=frollmean(x, width, na.rm=T, adaptive=TRUE, algo="exact"),
times=1L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zoo 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 32371.938248 1
# mapply 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 13351.726032 1
# wmapply 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 15114.774972 1
# ama 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 9780.239091 1
# fastama 351.618042 351.618042 351.618042 351.618042 351.618042 351.618042 1
# frollmean 7.708054 7.708054 7.708054 7.708054 7.708054 7.708054 1
# frollmean_exact 194.115012 194.115012 194.115012 194.115012 194.115012 194.115012 1
ama = function(x, n, na.rm=FALSE, fill=NA, nf.rm=FALSE) {
# more or less the same as previous forloopply
stopifnot((nx<-length(x))==length(n))
if (nf.rm) x[!is.finite(x)] = NA_real_
ans = rep(NA_real_, nx)
for (i in seq_along(x)) {
ans[i] = if (i >= n[i])
mean(x[(i-n[i]+1):i], na.rm=na.rm)
else as.double(fill)
}
ans
}
fastama = function(x, n, na.rm, fill=NA) {
if (!missing(na.rm)) stop("fast adaptive moving average implemented in R does not handle NAs, input having NAs will result in incorrect answer so not even try to compare to it")
# fast implementation of adaptive moving average in R, in case of NAs incorrect answer
stopifnot((nx<-length(x))==length(n))
cs = cumsum(x)
ans = rep(NA_real_, nx)
for (i in seq_along(cs)) {
ans[i] = if (i == n[i])
cs[i]/n[i]
else if (i > n[i])
(cs[i]-cs[i-n[i]])/n[i]
else as.double(fill)
}
ans
}
老答案:
我选择了 4 个不需要 C++ 的可用解决方案,很容易找到或谷歌。
# 1. rollapply
library(zoo)
?rollapplyr
# 2. mapply
base_mapply <- function(x, width, FUN, ...){
FUN <- match.fun(FUN)
f <- function(i, width, data){
if(i < width) return(NA_real_)
return(FUN(data[(i-(width-1)):i], ...))
}
mapply(FUN = f,
seq_along(x), width,
MoreArgs = list(data = x))
}
# 3. wmapply - modified version of wapply found: https://rmazing.wordpress.com/2013/04/23/wapply-a-faster-but-less-functional-rollapply-for-vector-setups/
wmapply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
SEQ1 <- 1:length(x)
SEQ1[SEQ1 < width] <- NA_integer_
SEQ2 <- lapply(SEQ1, function(i) if(!is.na(i)) (i - (width[i]-1)):i)
OUT <- lapply(SEQ2, function(i) if(!is.null(i)) FUN(x[i], ...) else NA_real_)
return(base:::simplify2array(OUT, higher = TRUE))
}
# 4. forloopply - simple loop solution
forloopply <- function(x, width, FUN = NULL, ...){
FUN <- match.fun(FUN)
OUT <- numeric()
for(i in 1:length(x)) {
if(i < width[i]) next
OUT[i] <- FUN(x[(i-(width[i]-1)):i], ...)
}
return(OUT)
}
以下是prod
功能的时间安排。mean
功能可能已经在内部进行了优化rollapplyr
。所有结果均等。
library(microbenchmark)
# 1a. length(x) = 1000, window = 5-20
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 59.690217 60.694364 61.979876 68.55698 153.60445 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 14.372537 14.694266 14.953234 16.00777 99.82199 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 9.384938 9.755893 9.872079 10.09932 84.82886 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 14.730428 15.062188 15.305059 15.76560 342.44173 100
# 1b. length(x) = 1000, window = 50-200
x <- runif(1000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 71.99894 74.19434 75.44112 86.44893 281.6237 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 15.67158 16.10320 16.39249 17.20346 103.6211 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 10.88882 11.54721 11.75229 12.19790 106.1170 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 15.70704 16.06983 16.40393 17.14210 108.5005 100
# 2a. length(x) = 10000, window = 5-20
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 5, to = 20, by = 5), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 753.87882 781.8789 809.7680 872.8405 1116.7021 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 148.54919 159.9986 231.5387 239.9183 339.7270 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 98.42682 105.2641 117.4923 183.4472 245.4577 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 533.95641 602.0652 646.7420 672.7483 922.3317 100
# 2b. length(x) = 10000, window = 50-200
x <- runif(10000,0.5,1.5)
width <- rep(seq(from = 50, to = 200, by = 50), length(x)/4)
microbenchmark(
rollapplyr(data = x, width = width, FUN = prod, fill = NA),
base_mapply(x = x, width = width, FUN = prod, na.rm=T),
wmapply(x = x, width = width, FUN = prod, na.rm=T),
forloopply(x = x, width = width, FUN = prod, na.rm=T),
times=100L
)
Unit: milliseconds
expr min lq median uq max neval
rollapplyr(data = x, width = width, FUN = prod, fill = NA) 912.5829 946.2971 1024.7245 1071.5599 1431.5289 100
base_mapply(x = x, width = width, FUN = prod, na.rm = T) 171.3189 180.6014 260.8817 269.5672 344.4500 100
wmapply(x = x, width = width, FUN = prod, na.rm = T) 123.1964 131.1663 204.6064 221.1004 484.3636 100
forloopply(x = x, width = width, FUN = prod, na.rm = T) 561.2993 696.5583 800.9197 959.6298 1273.5350 100