你可以并行聚合data.table
吗?是的。
这值得么?没有。这是上一个答案未能突出显示的关键点。
正如Matt Dowle在data.table and parallel computing中解释的那样,在并行运行操作时需要在分发之前制作副本(“块”)。这会减慢速度。在某些情况下,当您无法使用data.table
(例如运行许多线性回归)时,值得在内核之间拆分任务。但不是聚合——至少在data.table
涉及时是这样。
简而言之(除非另有证明),聚合使用data.table
并停止担心使用doMC
. data.table
与其他可用的聚合相比,它已经非常快了——即使它不是多核的!
以下是您可以自己运行的一些基准测试,比较使用with和的data.table
内部聚合。结果首先列出。by
foreach
mclapply
#-----------------------------------------------
# TL;DR FINAL RESULTS (Best to Worst)
# 3 replications, N = 10000:
# (1) 0.007 -- data.table using `by`
# (2) 3.548 -- mclapply with rbindlist
# (3) 5.557 -- foreach with rbindlist
# (4) 5.959 -- foreach with .combine = "rbind"
# (5) 14.029 -- lapply
# ----------------------------------------------
library(data.table)
## And used the following to create the dt
N <- 1e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")
setkey(dt, "a")
# TEST AGGREGATION WITHOUT PARALLELIZATION ---------------------------
## using data.tables `by` to aggregate
round(rowMeans(replicate(3, system.time({
dt[,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1], by=a)]
}))), 3)
# [1] 0.007 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
## using `lapply`
round(rowMeans(replicate(3, system.time({
results <- lapply(unique(dt[["a"]]), function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1])]
})
rbindlist(results)
}))), 3)
# [1] 14.029 elapsed for N == 10,000
# USING `mclapply` FORKING ---------------------------------
## use mclapply
round(rowMeans(replicate(3, system.time({
results <- mclapply(unique(dt[["a"]]),
function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}, mc.cores=4)
rbindlist(results)
}))), 3)
# [1] 3.548 elapsed for N == 10,000
# PARALLELIZATION USING `doMC` PACKAGE ---------------------------------
library(doMC)
mc = 4
registerDoMC(cores=mc)
getDoParWorkers()
# [1] 4
## (option a) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}))), 3)
# [1] 5.959 elapsed for N == 10,000
## (option b) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
results <-
foreach(x=unique(dt[["a"]])) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
rbindlist(results)
}))), 3)
# [1] 5.557 elapsed for N == 10,000
registerDoSEQ()
getDoParWorkers()
# [1] 1