使用 dplyr 处理这些数据要快得多:
library(dplyr)
# Time the dplyr approach: group the rows by the four key columns, then take
# the per-group median (ignoring NAs) of every column in the range
# inadist:larct. The summarised result is discarded; only the timing is kept.
# NOTE: summarise_each() and funs() were deprecated in later dplyr releases;
# they are kept here because this snippet targets the dplyr version named in
# the accompanying text.
system.time({
  data %>%
    group_by(groupname, starttime, fPhase, fCycle) %>%
    summarise_each(
      funs(median(., na.rm = TRUE)),
      inadist:larct
    )
})
#> user system elapsed
#> 0.391 0.004 0.395
(你需要 dplyr 0.2 才能使用 %>% 和 summarise_each。)
与 plyr 相比,这一结果更有优势:
library(plyr)
# Time the plyr approach: split `data` by the four key columns and apply
# median() to every numeric column of each piece. numcolwise() restricts the
# function to numeric columns; na.rm = TRUE is forwarded to median().
# The summarised data frame is stored in df.median.
system.time({
  df.median <- ddply(
    .data = data,
    .variables = .(groupname, starttime, fPhase, fCycle),
    .fun = numcolwise(median),
    na.rm = TRUE
  )
})
#> user system elapsed
#> 0.991 0.004 0.996
以及 aggregate()(代码来自 @joshua-ulrich):
# Base-R approach using aggregate().
# groupVars holds the grouping keys; dataVars is every remaining column
# except "location", i.e. the columns whose medians are computed.
groupVars <- c("groupname", "starttime", "fPhase", "fCycle")
dataVars <- setdiff(colnames(data), c("location", groupVars))
system.time({
  # List-style indexing (data[vars]) always yields a data frame, even when
  # only one column is selected, so the column names are preserved and
  # aggregate() receives the same kind of input regardless of column count.
  # na.rm = TRUE is forwarded to median() so that missing values are handled
  # the same way as in the dplyr and plyr versions of this benchmark.
  ag.median <- aggregate(
    data[dataVars],
    data[groupVars],
    median,
    na.rm = TRUE
  )
})
#> user system elapsed
#> 0.532 0.005 0.537