我尝试使用手动创建的引导程序构建黄土拟合的置信区间。但是,我得到的置信区间比默认情况下返回的一个 ggplot 窄得多。有人知道不匹配的原因吗?
这是我的data.frame:
structure(list(year = c(2008, 2008, 2008, 2008, 2008, 2008, 2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011,
2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2012,
2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012,
2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,
2013, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 2014, 2014,
2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2015), week = c(23L, 24L, 26L,
28L, 29L, 31L, 16L, 18L, 20L, 22L, 24L, 26L, 28L, 30L, 32L, 17L,
20L, 21L, 23L, 25L, 27L, 29L, 31L, 33L, 35L, 39L, 15L, 17L, 19L,
21L, 23L, 25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 13L, 15L, 18L,
19L, 21L, 23L, 25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 16L, 18L,
20L, 22L, 24L, 26L, 28L, 30L, 32L, 34L, 36L, 38L, 40L, 42L, 44L,
13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, 29L, 31L, 33L, 35L, 37L,
39L, 41L, 43L, 13L, 15L, 17L, 19L, 21L, 22L, 23L, 25L, 27L, 29L,
31L, 33L, 35L, 37L, 39L, 41L, 43L), reserves.mean_diff = c(10.1021084736772,
-0.766463989038282, -13.3480825958702, 17.8524842879165, -3.65951527473115,
1.25720024253449, -2.44358947696786, -17.1734197730956, -15.2733472665273,
23.9757799671593, -6.90926973198919, 12.8512968646567, 33.8026439651291,
18.8290534923951, -6.75273931878027, -9.05567300916137, 0.840392801449825,
13.2816315367936, 23.5379000084883, 114.685222001325, 198.219131614654,
45.9449929478138, 40.6682593034992, 36.5758841997304, -5.12350207874786,
2.99091939616023, 9.20093249342791, 13.8286436077293, 3.33161344050982,
2.74206598939135, 4.69395418700713, 37.0653947533576, 32.8195752624717,
15.1376101619812, 21.6067582543964, -12.6387803896504, -8.48222761312096,
-10.7138207854198, -6.81756227706404, -0.54799470071059, 17.4370345102052,
24.5750168199148, 42.3207905314873, 11.3309122402984, 14.3100926377377,
20.5176126940119, -14.6109288289383, -9.20511061567485, 11.0733419545538,
10.8024051420278, -0.270057655436401, -1.46335127110781, -5.14283957701451,
-40.3864151315141, -10.050054938347, 4.99244875943903, -14.7405146096816,
-1.8940167449459, 23.0718835304823, 3.97416790859413, -0.186360431147362,
-29.3893983448893, -33.5619726446489, -5.45050664215525, -4.35281287605294,
-8.66795740561471, 13.0625079675349, 17.3863495323096, -12.5987228607918,
13.0244755244755, 40.8933424230256, 154.135831381733, 176.376963350785,
126.405959031657, 9.35348446683459, -24.0794856808884, -35.2401205032784,
-33.9074690157674, -22.7480817589787, -53.1701860642767, -39.5347211123542,
-33.7617080905671, -23.2400740379139, -28.6645273028933, -39.172670230072,
-29.6969696969697, 65.3609831029186, -5.3627760252366, -8.31635425156209,
NA, -18.1931150293871, -63.7999305314345, -12.6303538175047,
-35.0136818267598, -39.5103036251362, -33.7963505188447, -28.8472127068456,
-14.4943657203656, -17.1046782114241, -12.0518976349798, -5.0786412899815
)), .Names = c("year", "week", "reserves.mean_diff"), class = "data.frame", row.names = c(8L,
233L, 458L, 683L, 908L, 1133L, 1358L, 1583L, 1808L, 2033L, 2258L,
2483L, 2708L, 2933L, 3158L, 3383L, 3608L, 3833L, 4058L, 4283L,
4508L, 4733L, 4958L, 5183L, 5408L, 5633L, 5858L, 6083L, 6308L,
6533L, 6758L, 6983L, 7208L, 7433L, 7658L, 7883L, 8108L, 8333L,
8558L, 8783L, 9008L, 9233L, 9458L, 9683L, 9908L, 10133L, 10358L,
10583L, 10808L, 11033L, 11258L, 11483L, 11708L, 11933L, 12158L,
12383L, 12608L, 12833L, 13058L, 13283L, 13508L, 13733L, 13958L,
14183L, 14408L, 14633L, 14858L, 15083L, 15308L, 15533L, 15758L,
15983L, 16208L, 16433L, 16658L, 16883L, 17108L, 17333L, 17558L,
17783L, 18008L, 18233L, 18458L, 18683L, 18908L, 19133L, 19358L,
19583L, 19808L, 20033L, 20258L, 20483L, 20708L, 20933L, 21158L,
21383L, 21608L, 21833L, 22058L, 22283L, 22508L))
这里是我尝试构建引导程序的脚本:
boot.loess = function(df = my.df, R = 1000,
y.var = "reserves.mean_diff", x.var = "week", x.values = weeks){
all.bootstraps = data.frame()
sample.size = nrow(df)
repeat {
df.sample = df[sample(nrow(df),sample.size, replace = T),]
lo.fit <- loess(get(y.var) ~ get(x.var), data = df.sample, control=loess.control(surface="direct"))
predicted.y = predict(lo.fit,x.values )
one.bootstrap = as.data.frame(t(data.frame(predicted.y)))
names(one.bootstrap) = x.values
all.bootstraps = rbind(all.bootstraps, one.bootstrap)
if(nrow(all.bootstraps)== R+1){
break
}
}
CI.all.weeks = data.frame()
for(i in 1:length(x.values)){
CI.lower_one.week = quantile(all.bootstraps[,i],0.025)
CI.upper_one.week = quantile(all.bootstraps[,i],0.975)
CI.one.week = data.frame(x.values[i],CI.lower_one.week,CI.upper_one.week)
names(CI.one.week)= c("week","CI.lower","CI.upper")
CI.all.weeks = rbind(CI.all.weeks,CI.one.week)
}
return(CI.all.weeks)
}
CI.all.weeks <- boot.loess()
然后我绘制了 ggplot 默认 CI(灰色区域)和自举 CI 的上下边界(作为点):
df.plot = merge(my.df, CI.all.weeks)
ggplot(df.plot,aes(x = week, y = reserves.mean_diff)) + geom_smooth() +
geom_point(aes(x = week, y = CI.lower))+
geom_point(aes(x = week, y = CI.upper))
有人知道这种不匹配的原因吗?我做错了什么?
PS:如果你想知道为什么我手动编写这个,尽管有可用的引导函数。我这样做是为了在下一步中为黄土拟合的一阶导数创建一个 CI。因此,当这可行时,我将对每个拟合进行数值区分,然后平均取 97.5% 和 2.5% 的百分位数来获得黄土拟合的一阶导数的 CI。
编辑:我现在正在使用 Roland 建议的分位数功能。但是我不知道如何使用引导功能。这是我尝试过的:
loess.fit = function(data = df){
lo.fit = loess(get(y.var) ~ get(x.var), data = data, control=loess.control(surface="direct"))
predicted.y = predict(lo.fit,x.values)
one.bootstrap = as.data.frame(t(data.frame(predicted.y)))
return(one.bootstrap)
}
boot(data=df, statistic=loess.fit, R=R)
在 aosmith 注意到它丢失后,我还添加了 return 命令。
正如 aosmith 正确指出的那样,我的函数实际上没有被使用,因为我没有将它分配给任何东西。多么菜鸟的错误。我还将返回函数放在不同的位置。在这里你可以看到它仍然不能很好地匹配它,但是它要好得多,我想可以接受。
谢谢@aosmith