我认为问题的底部是错误的,应该说:
强迫我用剩下的例子作为训练集
似乎提到的辅助功能xgb.cv.mknfold
也不再存在了。注意我的xgboost版本是0.71.2
.
然而,这似乎可以通过对 的小修改来相当直接地实现xgb.cv
,例如:
xgb.cv_new <- function(params = list(), data, nrounds, nfold, label = NULL,
missing = NA, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, folds_train = NULL,
verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL,
maximize = NULL, callbacks = list(), ...) {
check.deprecation(...)
params <- check.booster.params(params, ...)
for (m in metrics) params <- c(params, list(eval_metric = m))
check.custom.obj()
check.custom.eval()
if ((inherits(data, "xgb.DMatrix") && is.null(getinfo(data,
"label"))) || (!inherits(data, "xgb.DMatrix") && is.null(label)))
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
if (!is.null(folds)) {
if (!is.list(folds) || length(folds) < 2)
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds)
}
else {
if (nfold <= 1)
stop("'nfold' must be > 1")
folds <- generate.cv.folds(nfold, nrow(data), stratified,
label, params)
}
params <- c(params, list(silent = 1))
print_every_n <- max(as.integer(print_every_n), 1L)
if (!has.callbacks(callbacks, "cb.print.evaluation") && verbose) {
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n,
showsd = showsd))
}
evaluation_log <- list()
if (!has.callbacks(callbacks, "cb.evaluation.log")) {
callbacks <- add.cb(callbacks, cb.evaluation.log())
}
stop_condition <- FALSE
if (!is.null(early_stopping_rounds) && !has.callbacks(callbacks,
"cb.early.stop")) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
maximize = maximize, verbose = verbose))
}
if (prediction && !has.callbacks(callbacks, "cb.cv.predict")) {
callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
}
cb <- categorize.callbacks(callbacks)
dall <- xgb.get.DMatrix(data, label, missing)
bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- slice(dall, folds[[k]])
if (is.null(folds_train))
dtrain <- slice(dall, unlist(folds[-k]))
else
dtrain <- slice(dall, folds_train[[k]])
handle <- xgb.Booster.handle(params, list(dtrain, dtest))
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain,
test = dtest), index = folds[[k]])
})
rm(dall)
basket <- list()
num_class <- max(as.numeric(NVL(params[["num_class"]], 1)),
1)
num_parallel_tree <- max(as.numeric(NVL(params[["num_parallel_tree"]],
1)), 1)
begin_iteration <- 1
end_iteration <- nrounds
for (iteration in begin_iteration:end_iteration) {
for (f in cb$pre_iter) f()
msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(fd$bst, fd$dtrain, iteration - 1,
obj)
xgb.iter.eval(fd$bst, fd$watchlist, iteration - 1,
feval)
})
msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg)
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2)
for (f in cb$post_iter) f()
if (stop_condition)
break
}
for (f in cb$finalize) f(finalize = TRUE)
ret <- list(call = match.call(), params = params, callbacks = callbacks,
evaluation_log = evaluation_log, niter = end_iteration,
nfeatures = ncol(data), folds = folds)
ret <- c(ret, basket)
class(ret) <- "xgb.cv.synchronous"
invisible(ret)
}
我刚刚添加了一个可选参数folds_train = NULL
,稍后以这种方式在函数内部使用它(见上文):
if (is.null(folds_train))
dtrain <- slice(dall, unlist(folds[-k]))
else
dtrain <- slice(dall, folds_train[[k]])
然后您可以使用新版本的功能,例如如下:
# save original version
orig <- xgboost::xgb.cv
# devtools::install_github("miraisolutions/godmode")
godmode:::assignAnywhere("xgb.cv", xgb.cv_new)
# now you can use (call) xgb.cv with the additional argument
# once you are done, or may want to switch back to the original version
# (if you restart R you will also be back to the original version):
godmode:::assignAnywhere("xgb.cv", orig)
所以现在您应该能够使用额外的参数调用该函数,为训练数据提供额外的索引。
请注意,我没有时间对此进行测试。