请原谅我的长问题,但我真的希望有人可以尝试帮助我改进我的代码。基本上这就是我想做的:用不同的输入重复相同的模型(例如随机森林)10 次。作为每次迭代的结果,我想从每个模型中提取几个参数,并且在所有迭代之后从它们产生平均值和标准偏差(例如平均 AUC,平均偏差)。我可能会上传输入文件,但我的问题与不直接依赖它们的步骤有关,我认为它可以使用一些编码来解决。这是一个例子:
我正在使用来自伴随“dismo”包的小插图中的数据处理物种分布模型。所有代码都可以在这里找到:https ://rspatial.org/raster/sdm/6_sdm_methods.html#random-forest 首先我正在创建物种出现(pb=1)和伪缺失(pb=0)的数据)。这些伴随着两列中的经度和纬度坐标,后来的环境变量被连接到每个点。这里一切正常,所以我可以创建一个模型。但我想制作几个模型并平均它们的结果。
这些是我最初的步骤:
require(raster)
#that is my file with occurrence points:
points_herb <- read.csv("herbarium.csv",header=TRUE)
points_herb <- points_herb[,2:3]
points_herb <- SpatialPointsDataFrame(coords = points_herb, data = points_herb, proj4string + CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
> head(points_herb)
lon_x lat_y
1 19.62083 49.62917
2 19.64583 49.62917
3 20.23750 49.61250...
#Variables (I use variables from PCA ran on climate)
files <- list.files("D:/variables/",pattern='asc',full.names=TRUE)
predictors <- raster::stack(files)
> predictors
class : RasterStack
dimensions : 1026, 1401, 1437426, 2 (nrow, ncol, ncell, nlayers)
resolution : 0.008333333, 0.008333333 (x, y)
extent : 16.36667, 28.04167, 42.7, 51.25 (xmin, xmax, ymin, ymax)
crs : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
names : PCA1, PCA2
#Assigning variables to points
presvals <- extract(predictors, points_herb)
reading background points (about 20000):
points_back <- read.csv("back.csv",header=TRUE,dec = ".",sep = ",")
points_back <- points_back[,2:3]
points_back <- SpatialPointsDataFrame(coords = points_back, data = points_back, proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
assigning variables to background/pseudoabsence points
absvals <- extract(predictors, points_back)
absvals <- unique(absvals)
#**this is important!** Sampling 1000 random points from my entire dataset containing ca. 20000
absvals_1 <- absvals[sample(nrow(absvals), 1000), ]
#making an input file for the modeling
pb <- c(rep(1, nrow(presvals)), rep(0, nrow(absvals_1)))
sdmdata1 <- data.frame(cbind(pb, rbind(presvals, absvals_1)))
sdmdata1 <- na.omit(sdmdata1)```
> head(sdmdata1)
pb PCA1 PCA2
1 1 9.985359 2.419048
2 1 8.711462 2.229476
...
我运行模型:
#Random Forest
library(dismo)
library(randomForest)
#rf1- first random forest model
model_rf1 <- pb ~ PCA1 + PCA2
bc <- randomForest(model_rf1, data=sdmdata1)
#the model is predicted over a geographic space
bc_mod <- predict(predictors, bc, progress='')
#let's test it using CalibratR
require(CalibratR)
#extracting model probabilities to presence and absence points (those are actually from a separate dataset)
points_pres1 <- extract(bc_mod, points_pres1, cellnumbers=TRUE)
points_abs1 <- extract(bc_mod, points_abs1, cellnumbers=TRUE)
#prepare those data to test the model
testECE <- c(rep(1, nrow(points_pres1)), rep(0, nrow(points_abs1)))
testECE <- data.frame(cbind(testECE, rbind(points_pres1, points_abs1)))
testECE <- na.omit(testECE)
testECE <- subset(testECE, select = c(testECE, layer))
#make Expected Calibration Error
ECE <- getECE(testECE$testECE, testECE$layer, n_bins = 10)
#make Maximum Calibration Error
MCE <- getMCE(testECE$testECE, testECE$layer, n_bins = 10)
#some other test
require(Metrics)
#get RMSE values
RMSE <- rmse(testECE$testECE, testECE$layer)
random_forest_1 <- data.frame(mget(c('ECE', 'RMSE', 'MCE')))
rownames(random_forest_1) <- "random_forest1"
然后我想运行相同的模型但使用不同的背景点。因此,在这种情况下,我制作了另一个输入文件,其中包含来自整个数据集的另外 1000 个随机点:
absvals_2 <- absvals[sample(nrow(absvals), 1000), ]
pb <- c(rep(1, nrow(presvals_2)), rep(0, nrow(absvals_2)))
sdmdata2 <- data.frame(cbind(pb, rbind(presvals_2, absvals_2)))
sdmdata2 <- na.omit(sdmdata2)
model_rf2 <- pb ~ variable1 + variable2
bc <- randomForest(model_rf2, data=sdmdata2)
bc_mod <- predict(predictors, bc, progress='')
#again, let's test it using CalibratR
points_pres2 <- extract(bc_mod, points_pres2, cellnumbers=TRUE)
points_abs2 <- extract(bc_mod, points_abs2, cellnumbers=TRUE)
# everything just as above, the objects are overwritten
testECE <- c(rep(1, nrow(points_pres2)), rep(0, nrow(points_abs2)))
testECE <- data.frame(cbind(testECE, rbind(points_pres2, points_abs2)))
testECE <- na.omit(testECE)
testECE <- subset(testECE, select = c(testECE, layer))
ECE <- getECE(testECE$testECE, testECE$layer, n_bins = 10)
MCE <- getMCE(testECE$testECE, testECE$layer, n_bins = 10)
RMSE <- rmse(testECE$testECE, testECE$layer)
random_forest_2 <- data.frame(mget(c('ECE', 'RMSE', 'MCE')))
rownames(random_forest_2) <- "random_forest2"
#And finally let's make a mean from ECE, MCE, and RMSE
rf_results <- rbind(random_forest_1, random_forest_2)
rf_results_mean <- sapply(rf_results, 2, FUN=mean)
#and standard deviation
rf_results_sd <- sapply(rf_results, 2, FUN=sd)
result <- rbind(rf_results_mean, rf_results_sd)
在这个例子中,a 只重复了 2 次,但理想情况下我想做 10 次或 100 次。如何使它更优雅和自动,而不是手动创建 100 个对象..?