r - 如何在具有并行计算的自定义函数中有效地使用 do.call 函数

Question

我有一个旨在：

在两个模型的已知参数集（空和替代）下模拟两个数据集
将两个模型重新拟合到模拟数据

我想通过将并行包与 pblapply 包结合使用来加快计算时间。

这是功能：

simulate.data <- function (tree, null_m, alt_m, nsim = 5, do.parallel = T, optTEXT = NULL){

  ## null_m and alt_m are fitted using mvMORPH function
  library(mvMORPH)
  if (!all (class (null_m)[1] == "mvmorph" & class (alt_m)[1] == "mvmorph")) 
    stop ("Fitted models must be of class 'mvmorph'")

  ## define functions
  transform <- function (x){
    if (is.matrix (x)) {
      res <- vector ("list", ncol (x))
      for (i in 1:ncol (x)){
        res[[i]] <- x[,i]
      }
    }
    else {
      res <- x
    }
    res
  }

  find_fun <- function (x){
    class.temp <- class (x)[2]
    if (class.temp == "mvmorph.bm") return ("mvBM")
    if (class.temp == "mvmorph.ou") return ("mvOU")
    if (class.temp == "mvmorph.shift") return ("mvSHIFT")
  }

  ## take arguments of null and alternative fit
  call.fun.A <- find_fun (null_m)
  argsA <- null_m$param [names (null_m$param) %in% names (as.list (args (call.fun.A)))]
  argsA <- lapply (argsA, function (x) if (length(x)>1) x[1]
                   else x)

  call.fun.B <- find_fun(alt_m)
  argsB <- alt_m$param [names (alt_m$param) %in% names (as.list (args (call.fun.B)))]
  argsB <- lapply (argsB, function (x) if (length(x)>1) x[1]
                   else x)

  ## simulate datasets under null and alternative model
  A.df <- transform (simulate(object = null_m, tree = tree, nsim = nsim))
  B.df <- transform (simulate(object = alt_m, tree = tree, nsim = nsim))

  ## refit null (A) and alternative (B) model to simulated data
  # AA: fit null model to data simulated under null model

  library(pbapply)
  op <- pboptions(type = "timer") # default

  if (do.parallel){

    library(parallel)
    cl <- makeCluster(detectCores()-1)
    clusterEvalQ (cl, library(mvMORPH))
    clusterExport (cl, varlist=c("tree", ## tree
                                 "A.df", "B.df", ## simulated data
                                 "call.fun.A", "call.fun.B", ## values of these objects are names of mvMORPH functions to be called with do.call function
                                 "argsA", "argsB"), envir=environment()) ## 'args' objects specify arguments to be passed to do.call function 
    clusterExport (cl, varlist = "do.call")

    cat (paste0 ("\nfitting models to simulated data under the null model (", argsA$model, ")\n"))

    AA <- pblapply (X = A.df, FUN = function(x)
      do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
    AB <- pblapply (X = A.df, FUN = function(x)
      do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)

    cat (paste0 ("\nfitting models to simulated data under the alternative model (", argsB$model, ")\n"))

    BA <- pblapply (X = B.df, FUN = function(x)
      do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
    BB <- pblapply (X = B.df, FUN = function(x)
      do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)

    stopCluster(cl)

  }

  else {
    cat (paste0 ("\nfitting models to simulated data under the null model (", argsA$model, ")\n"))

    AA <- pblapply (X = A.df, FUN = function(x)
      do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))))
    AB <- pblapply (X = A.df, FUN = function(x)
      do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))))

    cat (paste0 ("\nfitting models to simulated data under the alternative model (", argsB$model, ")\n"))

    BA <- pblapply (X = B.df, FUN = function(x)
      do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))))
    BB <- pblapply (X = B.df, FUN = function(x)
      do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))))

  }

  res <- list (A = null_m, B = alt_m, AA = AA, AB = AB, BA = BA, BB = BB)
  class (res) <- append (class(res),"sim.data")

  if (!is.null(optTEXT)){
    attributes (res) <- c (attributes(res), comment = optTEXT)
    res
  }
  else res

}

此功能有效，但在并行计算过程中似乎存在瓶颈。我怀疑do.call函数引入了冗余，但我不确定......我仍然需要实现 do.call 或其他类似函数，因为我需要在 pblapply 中提供参数列表，并且参数特定于每个拟合。

为了证明并行计算过程中性能不足，我模拟并使用了以下数据：

library (phytools)

Generating a tree with 80 tips
set.seed(789)
tree <- pbtree (n = 80)

# Setting the regime states of tip species
regimes <- as.vector(c(rep("R1",40), rep ("R2", 40)))
names(regimes) <- tree$tip.label
tree <- make.simmap (tree, regimes , model="ER", nsim=1)

# Simulate data
library (mvMORPH)

sigma <- c (R1 = 3, R2 = 0.5)
theta <- 0

# Simulate data under the "BMM" model
data <- mvSIM (tree, nsim = 1, model="BMM", param = list (sigma = sigma, theta = theta))

# Fit models
fit1 <- mvBM (tree = tree, data = data, model = "BMM", method = "sparse")
fit2 <- mvOU (tree = tree, data = data, model = "OUM", method = "pseudoinverse", param = list (maxit = 50000))

## run the function
ss.data <- simulate.data(tree = tree, null_m = fit1, alt_m = fit2, nsim = 100, do.parallel = T)

在具有 i3 CPU 的计算机上，我使用了 3 个 worker 并获得了以下结果：

>fitting models to simulated data under the null model (BMM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 14s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 56s

>fitting models to simulated data under the alternative model (OUM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 51s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 03m 12s

当我运行与上面相同但没有并行计算（do.parallel = F）时，计算通常花费更少的时间：

>fitting models to simulated data under the null model (BMM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 32s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 01m 23s

>fitting models to simulated data under the alternative model (OUM)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 09s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 02m 02s

之后，我只是在全局环境中（不在函数内）运行我的部分函数，但使用并行计算。代码和结果如下：

cl <- makeCluster(detectCores()-1)
clusterEvalQ (cl, library(mvMORPH))
clusterExport (cl, varlist=c("tree", 
                             "A.df", "B.df",
                             "call.fun.A", "call.fun.B", 
                             "argsA", "argsB"), envir=environment())
clusterExport (cl, varlist = "do.call")

>fitting models to simulated data under the null model (BMM)            
AA <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
AB <- pblapply (X = A.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 26s    
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 57s

>fitting models to simulated data under the alternative model (OUM)        
BB <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.B, args = c (list (tree = tree, data = x), c (argsB, diagnostic=FALSE, echo=FALSE))), cl = cl)
BA <- pblapply (X = B.df, FUN = function(x)
do.call (call.fun.A, args = c (list (tree = tree, data = x), c (argsA, diagnostic=FALSE, echo=FALSE))), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 17s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 49s

stopCluster(cl)

请注意，全局环境中的并行计算时间远低于我的自定义函数中的时间......

最后，我只是在全局环境中进行并行计算，但没有 do.call 函数，结果证明它是最有效的：

cl <- makeCluster(detectCores()-1)
clusterEvalQ (cl, library(mvMORPH))
clusterExport (cl, varlist=c("tree", 
                             "A.df", "B.df"), envir=environment())

>fitting models to simulated data under the null model (BMM)
AA <- pblapply (X = A.df, FUN = function(x)
  mvBM (tree = tree, data = x, model = "BMM", method = "sparse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
AB <- pblapply (X = A.df, FUN = function(x)
  mvOU (tree = tree, data = x, model = "OUM", method = "pseudoinverse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 19s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 49s

>fitting models to simulated data under the alternative model (OUM)
BA <- pblapply (X = B.df, FUN = function(x)
  mvBM (tree = tree, data = x, model = "BMM", method = "sparse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
BB <- pblapply (X = B.df, FUN = function(x)
  mvOU (tree = tree, data = x, model = "OUM", method = "pseudoinverse", optimization = "L-BFGS-B", diagnostic=FALSE, echo=FALSE), cl = cl)
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 09s
>|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed = 41s

stopCluster(cl)

我感谢任何可能帮助我在我的函数中实现 do.call 的建议和/或解决方案，并结合并行处理提高性能。

score 0 · Accepted Answer

我发现do.call 函数没有任何问题，但主要问题是缺少 ram 来在我的函数中存储对象。

我在具有 4 GB 内存的计算机上尝试了该功能，并且使用该功能生成的对象很容易到达它。因此，计算机试图将存储在 ram 中的数据分配给 hdd，从而导致该功能的执行速度变慢。一种解决方案是将单个对象提取到具有功能的硬盘驱动器中，save()并按功能将它们从功能环境中删除rm ()。同样，升级 RAM 内存总是合理的。

我两者都做了，并且该功能运行良好。

r - 如何在具有并行计算的自定义函数中有效地使用 do.call 函数

这是功能：

1 回答 1

Related

Reference