I wrote a function for this. It also works with nominal predictors. It only handles lm objects (I think), but it could easily be extended to glm etc. (there is a rough sketch of that at the end of this answer).
# from
# http://stackoverflow.com/a/16030020/3980197
# via http://www.statmethods.net/stats/regression.html
#' Calculate k-fold cross-validated r2
#'
#' Using k-fold cross-validation, estimate the r2 expected in a new sample. This is a less optimistic estimate than the adjusted r2.
#' @param lmfit (an lm fit) An lm fit object.
#' @param folds (whole number scalar) The number of folds to use (default 10).
#' @param runs (whole number scalar) The number of times to repeat the cross-validation (default 100).
#' @param seed (whole number scalar or NA) Random seed for reproducibility; use NA to skip seeding (default 1).
#' @export
#' @examples
#' fit = lm("Petal.Length ~ Sepal.Length", data = iris)
#' MOD_k_fold_r2(fit)
MOD_k_fold_r2 = function(lmfit, folds = 10, runs = 100, seed = 1) {
  library(magrittr)
  #get the model frame used for the original fit
  data = lmfit$model
  #seed for reproducibility
  if (!is.na(seed)) set.seed(seed)
  v_runs = sapply(1:runs, FUN = function(run) {
    #randomly shuffle the data
    data2 = data[sample(nrow(data)), ]
    #create approximately equally sized folds
    folds_idx = cut(seq(1, nrow(data2)), breaks = folds, labels = FALSE)
    #perform k-fold cross-validation
    sapply(1:folds, function(i) {
      #segment the data by fold using the which() function
      test_idx = which(folds_idx == i)
      test_data = data2[test_idx, ]
      train_data = data2[-test_idx, ]
      #weights: reuse the original ones if present, otherwise use unit weights
      if ("(weights)" %in% names(data)) {
        train_data$.weights = train_data[["(weights)"]]
      } else {
        train_data$.weights = rep(1, nrow(train_data))
      }
      #refit on the training folds
      fit = lm(formula = lmfit$call$formula, data = train_data, weights = .weights)
      #predict the held-out fold
      preds = predict(fit, newdata = test_data)
      #squared correlation with the observed response = cross-validated r2
      cor(preds, test_data[[1]], use = "p")^2
    }) %>%
      mean()
  })
  #return both the in-sample and the cross-validated r2
  c("raw_r2" = summary(lmfit)$r.squared, "cv_r2" = mean(v_runs))
}
Testing it:
fit = lm("Petal.Length ~ Species", data = iris)
MOD_k_fold_r2(fit)
#> raw_r2 cv_r2
#> 0.9413717 0.9398156
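The "(weights)" branch can be exercised too by refitting a weighted model first. A quick sanity check, with weights that are made up purely for illustration:

# arbitrary weights, only to exercise the "(weights)" handling
set.seed(1)
w = runif(nrow(iris))
fit_w = lm(Petal.Length ~ Sepal.Length, data = iris, weights = w)
MOD_k_fold_r2(fit_w)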
On the OP's sample:
MOD_k_fold_r2(lmfit)
#> raw_r2  cv_r2
#>  0.724  0.718
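As mentioned at the top, extending this to glm mostly comes down to swapping the refit call and predicting on the response scale. A rough, untested sketch of that idea for a glm with a numeric response (e.g. a 0/1 outcome); the function name is made up here, and squared correlation is only one of several possible pseudo-r2 choices:

MOD_k_fold_r2_glm = function(glmfit, folds = 10, runs = 100, seed = 1) {
  library(magrittr)
  #model frame from the original glm fit
  data = glmfit$model
  if (!is.na(seed)) set.seed(seed)
  v_runs = sapply(1:runs, FUN = function(run) {
    data2 = data[sample(nrow(data)), ]
    folds_idx = cut(seq(1, nrow(data2)), breaks = folds, labels = FALSE)
    sapply(1:folds, function(i) {
      test_idx = which(folds_idx == i)
      test_data = data2[test_idx, ]
      train_data = data2[-test_idx, ]
      #refit with glm, reusing the original family
      fit = glm(formula = glmfit$call$formula, family = glmfit$family, data = train_data)
      #predictions on the response scale (e.g. probabilities for binomial models)
      preds = predict(fit, newdata = test_data, type = "response")
      #squared correlation with the observed (numeric) response
      cor(preds, test_data[[1]], use = "p")^2
    }) %>%
      mean()
  })
  c("cv_r2" = mean(v_runs))
}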