r - R 中的并行计算：实现 bootstrap（自助法）

Question

我目前正在尝试使用 BLB（Bag of Little Bootstraps）方法计算模型的估计量，并且希望并行进行。不并行执行时，我的代码可以正常工作；但并行计算时，每个核心返回的结果都包含 NA 值。我不明白这些 NA 值是怎么来的，因为 iris 数据集本身根本不包含 NA。这是我正在使用的代码：

library(doParallel)
library(itertools)

 # Bag-of-little-bootstraps style run: split iris row-wise into one chunk per
 # core, fit 400 bootstrap models inside each chunk, and cbind the per-chunk
 # coefficient means into `r`.
 num_of_cores <- detectCores()
 cl <- makePSOCKcluster(num_of_cores)
 registerDoParallel(cl)

 # NOTE(review): attach() looks unnecessary here -- the model call below names
 # its response via `sample_bootstrap$` and passes `data = sample_bootstrap`.
 attach(iris)
 data <- iris
 # Empty accumulator created once in the calling session; the loop body below
 # appends to it with rbind().
 # NOTE(review): presumably each worker starts from its own copy -- confirm.
 coeftmp <- data.frame()
 system.time(
 r <- foreach(dat = isplitRows(data, chunks=num_of_cores),
             .combine = cbind) %dopar% {

                 # Subsample size is (rows in this chunk)^0.6, not (rows in the
                 # full data set)^0.6, so it shrinks as the chunk count grows.
                 BLBsize = round(nrow(dat)^0.6)
                 for (i in 1:400){
                         # The seed depends only on i, so every chunk replays
                         # the same 400 seeds.
                         set.seed(i)

                         # sampling B(n) data points from the original data set without replacement
                         sample_BOFN <- dat[sample(nrow(dat), size = BLBsize, replace = FALSE), ]

                          # sampling from the subsample with replacement
                         sample_bootstrap <- sample_BOFN[sample(nrow(sample_BOFN), size = nrow(sample_BOFN), replace = TRUE), ]

                         # NOTE(review): likely source of the NA coefficients --
                         # a resample with too few distinct rows would leave some
                         # coefficients inestimable; confirm by inspecting the
                         # resamples that produce NA.
                         bootstrapModel <- glm(sample_bootstrap$Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, data = sample_bootstrap)
                         # Append this fit's coefficient vector as a new row.
                         coeftmp <- rbind(coeftmp, bootstrapModel$coefficients)

                 }
                 #calculating the estimators of the model with mean
                 # na.rm is not set, so a single NA in a column makes that
                 # column's mean NA.
                  colMeans(coeftmp)

         })

score 1 · Accepted Answer

我认为您需要在调试器中反复调试几次才能解决这个问题。不过，您的 NA 来自下面这一行：

# Fits Petal.Width on the three other measurements using only the rows of the
# current bootstrap resample.
bootstrapModel <- glm(sample_bootstrap$Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, data = sample_bootstrap)

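我猜是你的某些 sample_bootstrap 样本导致了奇异性（singularity），因为奇异的设计矩阵会让对应的系数变成 NA。（一种可能的成因：数据被分成的块越多，每块的行数越少，BLBsize = round(nrow(dat)^0.6) 也随之变小；对这么少的行再做有放回抽样，得到的不同行数可能少于待估系数的个数，模型就无法估计全部系数。）不过也可能是其他原因导致了这个问题，但它肯定来自这行代码……你需要在调试器中逐步执行来定位它。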

...即，这不是一个完整的答案。但这应该可以让您解决自己的问题：

您可以通过下面的检查看到这一点：

# Control run: the same bootstrap loop with the data kept as a single chunk
# (chunks=1). Each task returns the full coefficient table alongside the means
# so that NA cells can be counted afterwards.
# Uses `data` and `coeftmp`, which are not defined in this snippet.
r2 <- foreach(dat = isplitRows(data, chunks=1)) %dopar% {

     # Subsample size is (rows in this chunk)^0.6; with one chunk that is the
     # whole data set.
     BLBsize = round(nrow(dat)^0.6)
     for (i in 1:400){
       # Seed depends only on the iteration index.
       set.seed(i)

       # sampling B(n) data points from the original data set without replacement
       sample_BOFN <- dat[sample(nrow(dat), size = BLBsize, replace = FALSE), ]

       # sampling from the subsample with replacement
       sample_bootstrap <- sample_BOFN[sample(nrow(sample_BOFN), size = nrow(sample_BOFN), replace = TRUE), ]

       bootstrapModel <- glm(sample_bootstrap$Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, data = sample_bootstrap)
       # Append this fit's coefficient vector as a new row.
       coeftmp <- rbind(coeftmp, bootstrapModel$coefficients)

     }
     #calculating the estimators of the model with mean
     # return a list, not just the colMeans -- for debugging purposes
     return(list(coeftmp= coeftmp, result= colMeans(coeftmp)))

   }

   # Count NA cells in the coefficient table of the first (only) task.
   sum(is.na(r2[[1]][[1]])) # no missing coefficients with 1 core

# Diagnostic run: the same bootstrap loop, but with the data split into one
# chunk per core. No .combine is given, and the result is indexed as a list of
# per-chunk results below.
# Uses `data`, `coeftmp` and `num_of_cores`, which are not defined in this
# snippet.
r <- foreach(dat = isplitRows(data, chunks=num_of_cores)) %dopar% {

     # Subsample size is (rows in this chunk)^0.6, so it shrinks as the number
     # of chunks grows.
     BLBsize = round(nrow(dat)^0.6)
     for (i in 1:400){
       # Seed depends only on i, so every chunk replays the same 400 seeds.
       set.seed(i)

       # sampling B(n) data points from the original data set without replacement
       sample_BOFN <- dat[sample(nrow(dat), size = BLBsize, replace = FALSE), ]

       # sampling from the subsample with replacement
       sample_bootstrap <- sample_BOFN[sample(nrow(sample_BOFN), size = nrow(sample_BOFN), replace = TRUE), ]

       bootstrapModel <- glm(sample_bootstrap$Petal.Width ~ Petal.Length + Sepal.Length + Sepal.Width, data = sample_bootstrap)
       # Append this fit's coefficient vector as a new row.
       coeftmp <- rbind(coeftmp, bootstrapModel$coefficients)

     }
     #calculating the estimators of the model with mean
     # return a list, not just the colMeans -- for debugging purposes
     return(list(coeftmp= coeftmp, result= colMeans(coeftmp)))

   }

 # lots of missing values in your coeftmp results.
 # Per chunk: number of NA cells in that chunk's coefficient table.
 lapply(r, function(l) {sum(is.na(l[[1]]))})

r - R 中的并行计算：实现 bootstrap（自助法）

1 回答 1

Related

Reference