要发布在包中获得的可重现结果,mlr
应该使用该set.seed()
函数来控制代码的随机性。
测试,似乎这种做法不会导致预期的结果,其中代码的不同运行会给出稍微不同的输出,例如在这个问题的来源和以下代码中报告。
这是一些可重现的代码
## libraries
library(mlr)
library(parallel)
library(parallelMap)
## options
set.seed(1)
cv.n <- 3
bag.n <- 3
## data
dataset <- data.frame(matrix(rnorm(15000), ncol=5))
dataset$target <- factor(c(rep(0, 1500), rep(1, 1500)))
## task
classif.task <- makeClassifTask(id="dataset", data=dataset, target="target", positive="1")
## resampling strategy
rdesc <- makeResampleDesc("CV", iters=cv.n)
## number of features and observations
nf <- getTaskNFeats(classif.task)
no <- getTaskSize(classif.task)
# training and test set
train.set <- sample(no, size = round(0.8*no))
test.set <- setdiff(seq(no), train.set)
## learners
# decision tree
dt.lrn <- makeLearner("classif.rpart")
dt.lrn <- setPredictType(dt.lrn, predict.type="prob")
# random forest
rf.lrn <- makeLearner("classif.randomForest")
rf.lrn <- setPredictType(rf.lrn, predict.type="prob")
# neural network
nn.lrn <- makeLearner("classif.nnet", MaxNWts=5000, trace=FALSE)
nn.lrn <- makeBaggingWrapper(nn.lrn, bw.iters=bag.n)
nn.lrn <- setPredictType(nn.lrn, predict.type="prob")
# support vector machine
svm.lrn <- makeLearner("classif.svm", kernel="radial")
svm.lrn <- makeBaggingWrapper(svm.lrn, bw.iters=bag.n)
svm.lrn <- setPredictType(svm.lrn, predict.type="prob")
# gradient boosting machine
gbm.lrn <- makeLearner("classif.gbm", distribution="adaboost")
gbm.lrn <- makeBaggingWrapper(gbm.lrn, bw.iters=bag.n)
gbm.lrn <- setPredictType(gbm.lrn, predict.type="prob")
## benchmark
lrns <- list(dt.lrn, rf.lrn, nn.lrn, svm.lrn, gbm.lrn)
parallelStartMulticore(detectCores(), level="mlr.resample")
for (i in 1:5) {
bmrk <- suppressMessages(benchmark(lrns, subsetTask(classif.task, subset=train.set), rdesc, measures=list(mmce, acc, auc, tpr, tnr, ppv, f1)))
print(bmrk)
}
parallelStop()
结果如下:
task.id learner.id mmce.test.mean acc.test.mean auc.test.mean tpr.test.mean tnr.test.mean ppv.test.mean f1.test.mean
1 dataset classif.rpart 0.5050000 0.4950000 0.5035564 0.5857604 0.4216936 0.5001443 0.5132852
2 dataset classif.randomForest 0.5141667 0.4858333 0.4835233 0.4811167 0.4913591 0.4813667 0.4805184
3 dataset classif.nnet.bagged 0.4841667 0.5158333 0.5130378 0.5134648 0.5132865 0.5065907 0.5053251
4 dataset classif.svm.bagged 0.5200000 0.4800000 0.4783791 0.4596055 0.5038720 0.4754137 0.4634932
5 dataset classif.gbm.bagged 0.5175000 0.4825000 0.4999211 0.5681540 0.4307022 NaN 0.4079824
task.id learner.id mmce.test.mean acc.test.mean auc.test.mean tpr.test.mean tnr.test.mean ppv.test.mean f1.test.mean
1 dataset classif.rpart 0.5095833 0.4904167 0.4887304 0.5896894 0.3914619 0.4875778 0.5309759
2 dataset classif.randomForest 0.4920833 0.5079167 0.5094901 0.4981170 0.5180616 0.5037475 0.5005092
3 dataset classif.nnet.bagged 0.5037500 0.4962500 0.5007292 0.5182091 0.4794809 0.4894119 0.4873004
4 dataset classif.svm.bagged 0.4870833 0.5129167 0.5243128 0.4687382 0.5571168 0.5102900 0.4867651
5 dataset classif.gbm.bagged 0.5041667 0.4958333 0.5037020 0.3307626 0.6699108 NaN 0.2177101
task.id learner.id mmce.test.mean acc.test.mean auc.test.mean tpr.test.mean tnr.test.mean ppv.test.mean f1.test.mean
1 dataset classif.rpart 0.5154167 0.4845833 0.4997415 0.6023993 0.3909165 NaN 0.4194502
2 dataset classif.randomForest 0.5058333 0.4941667 0.5092414 0.4792270 0.5101295 0.4898916 0.4837489
3 dataset classif.nnet.bagged 0.4900000 0.5100000 0.5113847 0.6091273 0.4093971 0.5044188 0.5500985
4 dataset classif.svm.bagged 0.5025000 0.4975000 0.5093498 0.4597310 0.5386824 0.4937899 0.4711354
5 dataset classif.gbm.bagged 0.5045833 0.4954167 0.4966777 0.3333333 0.6666667 NaN 0.2181105
task.id learner.id mmce.test.mean acc.test.mean auc.test.mean tpr.test.mean tnr.test.mean ppv.test.mean f1.test.mean
1 dataset classif.rpart 0.5116667 0.4883333 0.4816318 0.40531599 0.5665951 0.4699498 0.42524328
2 dataset classif.randomForest 0.4966667 0.5033333 0.5088898 0.48810185 0.5189027 0.4994051 0.49328978
3 dataset classif.nnet.bagged 0.5229167 0.4770833 0.4914172 0.35887024 0.5964164 0.4683719 0.40099597
4 dataset classif.svm.bagged 0.5016667 0.4983333 0.4926987 0.49163773 0.5047228 0.4935696 0.49220550
5 dataset classif.gbm.bagged 0.5016667 0.4983333 0.4918831 0.04242424 0.9485944 NaN 0.06559572
task.id learner.id mmce.test.mean acc.test.mean auc.test.mean tpr.test.mean tnr.test.mean ppv.test.mean f1.test.mean
1 dataset classif.rpart 0.5016667 0.4983333 0.4928826 0.4982873 0.4894108 0.4845022 0.4687902
2 dataset classif.randomForest 0.5050000 0.4950000 0.4945688 0.4973200 0.4935429 0.4909559 0.4936415
3 dataset classif.nnet.bagged 0.5137500 0.4862500 0.4818591 0.4314678 0.5387981 0.4780440 0.4533055
4 dataset classif.svm.bagged 0.5054167 0.4945833 0.4962764 0.5692158 0.4246046 0.4932935 0.5251290
5 dataset classif.gbm.bagged 0.5129167 0.4870833 0.4852225 0.2393787 0.7450389 NaN 0.1945371
您可以看到不同的运行给出不同的数值输出。这在每个分类器中都是正确的,从最随机的到最小的。
我可以做些什么来确保可重复的结果?