对于我正在运行的分析,感兴趣的重要统计数据是存在于多个组中的每个组中的两个子组之间的百分比差异。
我想引导百分比差异,虽然第一次刺是一个总共需要 45 分钟的 for 循环(在 10 组中每次治疗大约有 10 次数千次观察),但我认为 purrr:map 会更快。事实证明不是,我想知道我做错了什么,或者我是否应该期待这种行为。
我可以在其上重现的最接近的内置数据集是warpbreak,在不同张力(组)下两根纱线(子组)之间的断数。请参阅下面的示例。虽然我对计时统计数据并不严格,但我看到循环中的速度与地图相比提高了大约 100 倍。
library(tidyverse)
library(tictoc)
library(ggplot2)
library(boot)
set.seed(1337)
test_data <- warpbreaks
test_data %>%
ggplot() +
geom_boxplot(aes(x=tension, y=breaks, color = wool))
# bootstrap the relative (percent) difference
# in this case Wool A vs. Wool B
reldiff <- function(d, i) {
m1 <- d[i,] %>%
filter(wool == "B") %>%
select(breaks) %>%
unlist() %>%
unname() %>%
mean()
m0 <- d[i,] %>%
filter(wool == "A") %>%
select(breaks) %>%
unlist() %>%
unname() %>%
mean()
(m1-m0)/m0
}
big_nest <- test_data %>%
group_by(tension) %>%
nest()
n_repetitions = 1000
tic()
test_boot <- big_nest %>%
dplyr::mutate(booted = purrr::map(.x = data, # The list-column containing <S3: tibble>
~ boot::boot(data = .x, # The <S3 tibble> column being sampled
statistic = reldiff, # The user-defined function
R = n_repetitions, # The number of replicates
stype = "i")$t %>% quantile(c(0.025, 0.975))))
toc(log=TRUE)
test_boot$booted
# [[1]]
# 2.5% 97.5%
# -0.54953761 -0.09286544
#
# [[2]]
# 2.5% 97.5%
# -0.1314650 0.6325054
#
# [[3]]
# 2.5% 97.5%
# -0.4379570 0.0589989
### for loop version
reldiff_loop <- function(b, a){
(mean(b)-mean(a))/mean(a)
}
bootstrapper <- function(df, field = "breaks", stratum, filter, grouper, n_times = 1000){
subset = df[which(pull(df, stratum) == filter), ]
b_vec = pull(subset, field)[which(pull(subset, grouper)=="B")]
a_vec = pull(subset, field)[which(pull(subset, grouper)=="A")]
theta = reldiff_loop(b_vec, a_vec)
thetahat = rep(0,times = n_times)
for (j in 1:n_times){
thetahat[j] = reldiff_loop(mean(sample(b_vec,replace = TRUE)), mean(sample(a_vec,replace =TRUE)))
}
bounds = quantile(theta-thetahat, probs = c(0.025, 0.975)) %>% unname()
thL = theta - bounds[2]
thU = theta - bounds[1]
# print(paste('The 95% BBootstrap CI for ',i,'is:',thL,thU))
return(paste0(round(c(thL, thU),4), collapse = ", "))
}
tic()
z <- NULL
for(i in c("L", "M", "H")){
z <- rbind(z, bootstrapper(test_data,
field = "breaks",
stratum = "tension",
filter = i,
grouper = "wool",
n_times = n_repetitions))
}
toc(log=T)
z
# [,1]
# [1,] "-0.5469, -0.0952"
# [2,] "-0.1181, 0.6155"
# [3,] "-0.4225, 0.0403"
# ~100x faster on one sample
data.frame(trial = c("purrr", "for-loop"), time = unlist(tic.log()))
# trial time
# purrr 20.18 sec elapsed
# for-loop 0.2 sec elapsed
接受任何建议。或者也许for循环是要走的路