0

所以我有一个 162 x 151 的数据集:-

RT (seconds)    76_TI2  114_DECC    120_Lop 212_PCD
38  4.086   1.2 2.322   0
40  2.732   0.815   1.837   1.113
41  4.049   1.153   2.117   2.354
41  4.049   1.153   2.117   3.838
42  4.56    1.224   2.128   2.38
42  2.96    0.909   1.686   0.972
42  3.237   0.96    1.922   1.202
44  2.989   0.8 1.761   2.034

我想使用 10 折交叉验证在其上构建一个随机森林模型,然后查看每个折叠的预测值和实际值。我正在使用 randomForest 包。我做了:-

> set.seed(1500)
> model <- rfcv(x,y, cv.fold=10)

但是我无法找到一种方法来简单地查看从每个折叠中获得的所有预测值以及与之对应的实际值。我该怎么做呢?

谢谢

4

1 回答 1

1

交叉验证得到的预测值存储在 model$predicted[[1]] 中,对应的观察值(实际值)就是 y。如果要分别查看每个折叠的预测值,则需要获取有关折叠拆分的信息。为此,您可以:

1)手动拆分折叠并自己进行交叉验证

2)使用caret

3)稍微修改 rfcv 以输出此信息——将 idx 添加到输出列表

#' Cross-validated random forest with stepwise predictor reduction
#'
#' Variant of `rfcv()` that additionally returns the fold assignment of every
#' observation, so that per-fold predictions can be compared with the
#' observed values.
#'
#' For each fold a forest is fitted on the remaining folds using all
#' predictors, then refitted on successively smaller sets of the top-ranked
#' predictors. Predictors are ranked by the first column of the fitted
#' forest's importance matrix.
#'
#' @param trainx Matrix or data frame of predictors (rows are observations).
#' @param trainy Response vector. A factor triggers classification; anything
#'   else is treated as regression.
#' @param cv.fold Number of cross-validation folds.
#' @param scale If `"log"`, the number of predictors is multiplied by `step`
#'   at each reduction; otherwise `step` is used as the `by` argument of
#'   `seq(from = p, to = 1, by = step)`.
#' @param step Reduction factor (`scale = "log"`) or increment (otherwise).
#' @param mtry Function mapping a number of predictors to the `mtry` value
#'   passed to `randomForest()`.
#' @param recursive If `TRUE`, predictors are re-ranked after every reduction
#'   using the importance from the reduced model.
#' @param ... Further arguments forwarded to every `randomForest()` call.
#' @return A list with components
#'   `n.var` (numbers of predictors tried),
#'   `error.cv` (misclassification rate for a factor response, mean squared
#'   error otherwise; one value per entry of `n.var`),
#'   `predicted` (list of cross-validated prediction vectors, one per entry
#'   of `n.var`, in the order of `trainy`), and
#'   `idx` (fold number assigned to each observation).
rfcv2 <- function(trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
                  mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE,
                  ...) {
  classRF <- is.factor(trainy)
  n <- nrow(trainx)
  p <- ncol(trainx)

  # Sequence of predictor counts to evaluate, from p down to 1.
  if (scale == "log") {
    k <- floor(log(p, base = 1 / step))
    # seq_len(k) - 1 is empty when k == 0 (single predictor), whereas
    # 0:(k - 1) would count backwards and request more predictors than exist.
    n.var <- round(p * step^(seq_len(k) - 1))
    same <- diff(n.var) == 0
    if (any(same)) {
      n.var <- n.var[-which(same)]
    }
    if (!1 %in% n.var) {
      n.var <- c(n.var, 1)
    }
  } else {
    n.var <- seq(from = p, to = 1, by = step)
  }
  k <- length(n.var)

  # One prediction vector per predictor count, initialised with the response
  # so that each has the right type and length; every entry is overwritten
  # by the fold that holds the observation out.
  cv.pred <- rep(list(trainy), k)

  # Stratify the fold assignment: by class for classification, by five
  # rank-based groups of the response for regression.
  if (classRF) {
    f <- trainy
  } else {
    f <- factor(
      rep(1:5, length.out = length(trainy))[order(order(trainy))]
    )
  }
  nlvl <- table(f)
  idx <- numeric(n)
  for (i in seq_along(nlvl)) {
    members <- which(f == levels(f)[i])
    idx[members] <- sample(rep(seq_len(cv.fold), length.out = nlvl[i]))
  }

  for (i in seq_len(cv.fold)) {
    held.out <- idx == i

    # Full model on the training folds, predicting the held-out fold.
    all.rf <- randomForest(
      trainx[!held.out, , drop = FALSE], trainy[!held.out],
      trainx[held.out, , drop = FALSE], trainy[held.out],
      mtry = mtry(p), importance = TRUE, ...
    )
    cv.pred[[1]][held.out] <- all.rf$test$predicted

    # Column indices of trainx, most important first.
    impvar <- seq_len(p)[order(all.rf$importance[, 1], decreasing = TRUE)]

    # seq_len(k)[-1] is empty when only one predictor count is evaluated;
    # 2:k would iterate over c(2, 1) in that case.
    for (j in seq_len(k)[-1]) {
      imp.idx <- impvar[seq_len(n.var[j])]
      sub.rf <- randomForest(
        trainx[!held.out, imp.idx, drop = FALSE], trainy[!held.out],
        trainx[held.out, imp.idx, drop = FALSE], trainy[held.out],
        mtry = mtry(n.var[j]), importance = recursive, ...
      )
      cv.pred[[j]][held.out] <- sub.rf$test$predicted

      if (recursive) {
        # The reduced model ranks only the columns in imp.idx, so map the
        # ranking back to column indices of trainx. Using positions within
        # the subset would select the wrong columns at the next step.
        impvar <- imp.idx[order(sub.rf$importance[, 1], decreasing = TRUE)]
      }
    }
  }

  if (classRF) {
    error.cv <- vapply(cv.pred, function(x) mean(trainy != x), numeric(1))
  } else {
    error.cv <- vapply(cv.pred, function(x) mean((trainy - x)^2), numeric(1))
  }
  names(error.cv) <- names(cv.pred) <- n.var

  list(n.var = n.var, error.cv = error.cv, predicted = cv.pred, idx = idx)
}

现在你可以这样调用:

# Run the modified cross-validation with 10 folds; the returned list also
# carries the fold assignment in `idx`.
model <- rfcv2(x,y, cv.fold=10)
model$idx  # fold number assigned to each observation (the fold split)

请注意,rfcv 函数并不是为单纯的交叉验证而设计的,而是为变量选择而设计的。因此,如果您只需要交叉验证的预测值,它会执行大量冗余计算。

于 2013-12-09T08:52:24.763 回答