5

是否存在使用 data.table 分组计算滚动统计数据的现有习语?

例如,给定以下代码:

# Example table: three groups in x, an alternating key column y, values v.
DT <- data.table(
  x = rep(c("a", "b", "c"), each = 2),
  y = c(1, 3),
  v = 1:6
)
setkey(DT, y)

# Rolling mean of v computed separately within each value of y.
stat.ror <- DT[
  ,
  rollapply(v, width = 1, by = 1, FUN = mean, na.rm = TRUE),
  by = y
]

如果还没有,最好的方法是什么?

4

1 回答 1

3

事实上,我现在正试图解决这个问题。这是一个部分解决方案,可用于按单列分组:

编辑:我想我已经用 RcppRoll 解决了这个问题:

# Rolling (windowed) mean of a value column and an index column, computed
# separately within each group of a data.table.
#
# Args:
#   input.table:  data.table holding the index, value and grouping columns.
#   window.width: number of consecutive rows averaged in each window.
#   id.cols:      character vector of grouping column names
#                 (default: the third column of input.table).
#   index.col:    name of the index column (default: the first column).
#   val.col:      name of the value column (default: the second column).
#
# Returns:
#   A data.table with the columns id.cols, val.col, index.col (in that order),
#   holding one row per window per group.
#
# Raises:
#   An error if any requested column is absent from input.table, or if the
#   RcppRoll package is not installed.
windowed.average <- function(input.table,
                             window.width = 2,
                             id.cols = names(input.table)[3],
                             index.col = names(input.table)[1],
                             val.col = names(input.table)[2]) {
  # Validate the column names up front: looking columns up by string with
  # get() could otherwise silently pick up an unrelated object of the same
  # name from an enclosing environment.
  missing.cols <- setdiff(c(id.cols, val.col, index.col), names(input.table))
  if (length(missing.cols) > 0) {
    stop("columns not found in input.table: ",
         paste(missing.cols, collapse = ", "),
         call. = FALSE)
  }

  # require() only returns FALSE when a package is missing; fail loudly instead.
  if (!requireNamespace("RcppRoll", quietly = TRUE)) {
    stop("package 'RcppRoll' is required by windowed.average()", call. = FALSE)
  }

  # A single grouped pass averages the value and index columns together, so
  # the two result columns are aligned by construction and already carry the
  # input column names.
  input.table[,
              lapply(.SD, RcppRoll::roll_mean, n = window.width),
              by = c(id.cols),
              .SDcols = c(val.col, index.col)]
}

上述函数可以通过下面这个(写得不太好的)单元测试:

# library() stops with an error when a package is missing, whereas require()
# would only return FALSE and let the script fail later.
library(data.table)
library(testthat)
library(zoo)

# Fix the RNG seed so the random voltages (and thus the test) are reproducible.
set.seed(42)
test.datatable <- data.table(Time = rep(seq_len(10), times = 2),
                             Voltage = runif(20),
                             Channel = rep(seq_len(2), each = 10))
test.width <- 8

# first test: single id column
# Reference result: zoo::rollapply means of Voltage and Time per Channel.
test.avgtable <- data.table(
  test.datatable[, rollapply(Voltage, width = test.width, mean, na.rm = TRUE),
                 by = c("Channel")],
  Time = test.datatable[, rollapply(Time, width = test.width, mean,
                                    na.rm = TRUE),
                        by = c("Channel")]$V1)
setnames(test.avgtable, old = names(test.avgtable),
         new = c("Channel", "Voltage", "Time"))

# The reference and the function under test use different rolling-mean
# implementations, which may differ in the last floating-point bits; compare
# with numeric tolerance instead of bit-for-bit identity.
expect_equal(windowed.average(test.datatable, test.width), test.avgtable)

数据和结果如下所示:

> test.datatable
    Time     Voltage Channel Class
 1:    1 0.310935570       1     1
 2:    2 0.565257533       1     2
 3:    3 0.577278573       1     1
 4:    4 0.152315111       1     2
 5:    5 0.836052122       1     1
 6:    6 0.655417230       1     2
 7:    7 0.034859642       1     1
 8:    8 0.572040136       1     2
 9:    9 0.268105436       1     1
10:   10 0.126484340       1     2
11:    1 0.139711248       2     1
12:    2 0.336316520       2     2
13:    3 0.413086486       2     1
14:    4 0.304146029       2     2
15:    5 0.399344631       2     1
16:    6 0.581641210       2     2
17:    7 0.183586025       2     1
18:    8 0.009775488       2     2
19:    9 0.449576242       2     1
20:   10 0.938517952       2     2

> test.avgtable
   Channel   Voltage Time
1:       1 0.4630195  4.5
2:       1 0.4576657  5.5
3:       1 0.4028191  6.5
4:       2 0.2959510  4.5
5:       2 0.3346841  5.5
6:       2 0.4099593  6.5

遗憾的是,我当时没能让它支持多列分组(见下面的第二部分):

对于多列分组,结果看起来也没有问题:

# second test: multiple id columns
# Depends on the first test passing to be meaningful; reuses test.datatable
# created there.
test.width <- 4
# Alternate the class label 1, 2, 1, 2, ... down the rows.
test.datatable[, Class := rep_len(seq_len(2), nrow(test.datatable))]

# Reference result: run the single-id version on each Class subset and tag
# every partial result with the class it was computed from. Tagging before
# binding keeps each label attached to its own rows; assigning labels by
# position after rbind() mislabels them, because the bound rows are ordered by
# Class first and by Channel second.
class.ids <- sort(unique(test.datatable$Class))
test.avgtable <- rbindlist(lapply(class.ids, function(cls) {
  part <- windowed.average(test.datatable[Class == cls, ], test.width)
  part[, Class := cls]
  part
}))

# Sort to Channel-major order, which is the order in which the
# (Channel, Class) groups first appear in test.datatable. order() is used
# rather than setkey() so that no key attribute is added to the reference.
test.avgtable <- test.avgtable[order(Channel, Class)]
setcolorder(test.avgtable, c("Channel", "Class", "Voltage", "Time"))

expect_equal(windowed.average(test.datatable, test.width,
                              id.cols = c("Channel", "Class")),
             test.avgtable)
于 2014-05-08T05:33:46.530 回答