r - Tidymodels，所有模型失败；model.frame.default 中的错误和秩不足拟合的预测可能会产生误导

Question

我在使用 tidymodels-tuning 时遇到问题，它会给出错误和警告：

警告：prediction from a rank-deficient fit may be misleading
错误：Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = ob...

注 1：我正在对正常 CV 折叠和空间 CV 折叠进行调整

注 2：我想包含数据，但 Stack Overflow 给了我：正文限制为 30000 个字符；您输入了 143552。如果您愿意，我可以将数据发送给您！

定义 lm 模型

# Fit the lm workflow on every spatial CV resample and score each one with
# the shared metric set.
lm_fit_spatcv <- fit_resamples(
  object = lm_wf,
  resamples = spatial_cv_fold,
  metrics = multi.metric,
  control = model.control
)

定义 glm 模型

# Fit the glmnet workflow on every spatial CV resample and score each one with
# the shared metric set.
glm_fit_spatcv <- fit_resamples(
  object = glm_wf,
  resamples = spatial_cv_fold,
  metrics = multi.metric,
  control = model.control
)

我已经在这里、这里和这里做过一些研究，但仍然没有弄清楚我的预处理步骤中究竟是哪一步导致了这些问题。

做了一个reprex

# Loading packages

library(tidyverse)
library(parallelMap)
library(parallelly)
library(parallel) 
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip
library(treesnip)                  
#> Error in library(treesnip): there is no package called 'treesnip'
library(kknn)       
library(kernlab) 
#> 
#> Attaching package: 'kernlab'
#> The following object is masked from 'package:scales':
#> 
#>     alpha
#> The following object is masked from 'package:purrr':
#> 
#>     cross
#> The following object is masked from 'package:ggplot2':
#> 
#>     alpha
library(ranger)
library(datapasta)
library(spatialsample)
library(stacks)

# DATA

# agrofor.biophys.modelling.data <- read.csv(file = here::here("DATA","agrofor.biophys.modelling.data.csv")) 

# Creating sample data
  # agrofor.biophys.modelling.data <- agrofor.biophys.modelling.data %>% 
  #  dplyr::slice_sample(n = 100, replace = FALSE) %>%
  #  as_tibble()

# making a tibble::tribble dataset using dpasta() from the datapasta package

  # datapasta::dpasta(agrofor.biophys.modelling.data)

# Here was a tibble::tribble dataset. I can send you the data if you wish!

# Removing observations with NAs from the data 

# Work on a copy of the raw data and keep only the rows in which every column
# is observed.
ml.data.clean <- data.table::copy(agrofor.biophys.modelling.data) %>%
  drop_na()

# Sanity check: count the NAs left in each column (one row, one count per
# column, column names unchanged).
# `summarise(across(...))` replaces the deprecated `summarise_all(funs(...))`;
# the deprecation warning recorded just below was emitted by that older call.
# The former `select(everything())` was a no-op and has been dropped.
ml.data.clean.na.check <- ml.data.clean %>%
  summarise(
    # replace `everything()` with a narrower column selection if needed
    across(everything(), ~ sum(is.na(.x)))
  )
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas: 
#> 
#>   # Simple named list: 
#>   list(mean = mean, median = median)
#> 
#>   # Auto named with `tibble::lst()`: 
#>   tibble::lst(mean, median)
#> 
#>   # Using lambdas
#>   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

# Checking for na

#sapply(ml.data.clean.na.check, function(x) sum(is.na(x)))







# STEP 1: Splitting data, defining resampling techniques and setting global model metrics

## Splitting data in training and testing sets

# Fix the RNG seed so the random split below is reproducible.
set.seed(234)

# Splitting data
# 80 % of the rows go to the training partition; sampling is stratified on
# `logRR` (the outcome used in the recipes further down).
af.split <- initial_split(ml.data.clean, prop = 0.80, strata = logRR)
#> Warning: The number of observations in each quantile is below the recommended
#> threshold of 20. Stratification will be done with 3 breaks instead.

# Extract the two partitions from the split object.
af.train <- training(af.split)
af.test <- testing(af.split)


## Defining resampling techniques


# Re-sample technique(s)
# 10 bootstrap resamples of the training set, stratified on `logRR`.
# (This object is not referenced again in the code shown here.)
boostrap_df <- bootstraps(af.train, times = 10, strata = logRR)
#> Warning: The number of observations in each quantile is below the recommended
#> threshold of 20. Stratification will be done with 2 breaks instead.
# Random 10-fold cross-validation, repeated 10 times.
cv_fold <- vfold_cv(af.train, v = 10, repeats = 10)
# 20 spatial folds, formed by clustering the rows on their Longitude/Latitude
# coordinates.
spatial_cv_fold <- spatial_clustering_cv(af.train, coords = c("Longitude", "Latitude"), v = 20)


## Setting global metrics


# Metrics
# Metric set evaluated on every resample: RMSE, R-squared, concordance
# correlation coefficient (ccc) and MAE.
multi.metric <- metric_set(rmse, rsq, ccc, mae)

# Control object passed to every fit_resamples() call below.
# NOTE(review): control_stack_grid() is the stacks helper intended for
# tune_grid(); stacks also provides control_stack_resamples() for
# fit_resamples() — confirm which one is meant here.
model.control <- control_stack_grid()


# STEP 2: Model recipes - pre-processing steps

# Linear model - lm recipe
# `logRR` is the outcome; through `logRR ~ .` every other column of af.train
# starts out with the predictor role.
lm_recipe <- 
  recipe(formula = logRR ~ ., data = af.train) %>% 
  update_role(Site.Type, new_role = "predictor") %>%
  # Keep the coordinates and Tree in the data, but take them out of the
  # predictor set by giving them an ID role.
  # NOTE(review): the accepted answer further down additionally assigns this
  # role to PrName, Out.SubInd, Out.SubInd.Code and Product; here those
  # columns are still predictors.
  update_role(Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>% 
  # Map factor levels not seen at training time to a dedicated new level.
  step_novel(Site.Type, -all_outcomes()) %>% 
  # One indicator column per Site.Type level (one-hot), named with "_".
  step_dummy(Site.Type, one_hot = TRUE, naming = partial(dummy_names,sep = "_")) %>%
  # Drop predictors that contain a single unique value.
  step_zv(all_predictors()) %>% 
  # Center and scale the non-nominal predictors.
  step_normalize(all_predictors(), -all_nominal()) %>%
  # Drop near-zero-variance numeric columns (outcome excluded).
  step_nzv(all_numeric(), -all_outcomes()) %>%
  # Drop numeric predictors involved in high pairwise correlations (the step's
  # default threshold is used).
  step_corr(all_numeric_predictors()) %>%
  # NOTE(review): centering and scaling again after step_normalize() looks
  # redundant — confirm it is intentional.
  step_center(all_numeric_predictors())  %>%
  step_scale(all_numeric_predictors()) 

# Generalised linear model recipe
# Same pre-processing as the lm recipe, plus a final step_lincomb().
# `logRR` is the outcome; every other column of af.train starts out as a
# predictor.
glm_recipe <- 
  recipe(formula = logRR ~ ., data = af.train) %>% 
  update_role(Site.Type, new_role = "predictor") %>%
  # Keep the coordinates and Tree in the data, but take them out of the
  # predictor set by giving them an ID role.
  # NOTE(review): the accepted answer further down additionally assigns this
  # role to PrName, Out.SubInd, Out.SubInd.Code and Product in the lm recipe;
  # presumably the same change is needed here — confirm.
  update_role(Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>% 
  # Map factor levels not seen at training time to a dedicated new level.
  step_novel(Site.Type, -all_outcomes()) %>% 
  # One indicator column per Site.Type level (one-hot), named with "_".
  step_dummy(Site.Type, one_hot = TRUE, naming = partial(dummy_names,sep = "_")) %>%
  # Drop predictors that contain a single unique value.
  step_zv(all_predictors()) %>% 
  # Center and scale the non-nominal predictors.
  step_normalize(all_predictors(), -all_nominal()) %>%
  # Drop near-zero-variance numeric columns (outcome excluded).
  step_nzv(all_numeric(), -all_outcomes()) %>%
  # Drop numeric predictors involved in high pairwise correlations (the step's
  # default threshold is used).
  step_corr(all_numeric_predictors()) %>% 
  # NOTE(review): centering and scaling again after step_normalize() looks
  # redundant — confirm it is intentional.
  step_center(all_numeric_predictors())  %>%
  step_scale(all_numeric_predictors()) %>%
  # Drop numeric columns that are exact linear combinations of other columns
  # (outcome excluded).
  step_lincomb(all_numeric(), -all_outcomes()) 


# STEP 3: Setting model specifications 

# Linear regression specification fitted with the "lm" engine.
lm_model <- linear_reg(mode = "regression") %>%
  set_engine("lm")

# Linear regression specification fitted with the "glmnet" engine, using a
# fixed penalty of 0.1 and mixture = 0.
glm_model <- linear_reg(penalty = 0.1, mixture = 0) %>%
  set_mode("regression") %>%
  set_engine("glmnet")


# STEP 4: Defining model workflows

# Workflow pairing the lm recipe with the lm model specification.
lm_wf <- workflow() %>%
  add_recipe(lm_recipe) %>%
  add_model(lm_model)

# Workflow pairing the glm recipe with the glmnet model specification.
glm_wf <- workflow() %>%
  add_recipe(glm_recipe) %>%
  add_model(glm_model)

# STEP 5: Model (hyper)-parameter tuning

# Initializing parallel processing 
# Start a socket-mode parallelMap session with one worker per detected core.
# NOTE(review): presumably tune's resampling functions need a registered
# foreach backend to run in parallel — verify that this parallelMap session is
# actually used by fit_resamples().
parallelStartSocket(cpus = detectCores()) 
#> Starting parallelization in mode=socket with cpus=8.

##########################################################################
# Spatial k-fold cross validation 
##########################################################################
# Fit the lm workflow on every spatial CV resample and score each one with
# the shared metric set.
lm_fit_spatcv <- fit_resamples(
  object = lm_wf,
  resamples = spatial_cv_fold,
  metrics = multi.metric,
  control = model.control
)
#> ! Fold01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold02: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...


# Fit the glmnet workflow on every spatial CV resample and score each one with
# the shared metric set.
glm_fit_spatcv <- fit_resamples(
  object = glm_wf,
  resamples = spatial_cv_fold,
  metrics = multi.metric,
  control = model.control
)
#> x Fold01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold02: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, 
#> Warning: All models failed. See the `.notes` column.

##########################################################################
# Normal/random k-fold cross validation (CV-fold)  
##########################################################################

# Fit the lm workflow on every random (repeated v-fold) CV resample and score
# each one with the shared metric set.
lm_fit_cv <- fit_resamples(
  object = lm_wf,
  resamples = cv_fold,
  metrics = multi.metric,
  control = model.control
)
#> ! Fold01, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold02, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold03, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...


# Fit the glmnet workflow on every random (repeated v-fold) CV resample and
# score each one with the shared metric set.
glm_fit_cv <- fit_resamples(
  object = glm_wf,
  resamples = cv_fold,
  metrics = multi.metric,
  control = model.control
)
#> x Fold01, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold02, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold03, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...

#> Warning: All models failed. See the `.notes` column.

# Stopping parallel session
# Shut down the parallelMap session started with parallelStartSocket().
parallelStop()
#> Stopped parallelization. All cleaned up.

<sup>由 reprex 包（v2.0.1）于 2021-09-03 创建</sup>

score 1 · Accepted Answer

解决了！

……真是浪费时间。我忘了把下面这些变量（PrName、Out.SubInd、Out.SubInd.Code 和 Product）也设为 "sample ID" 角色，结果它们被当成了预测变量：

# Corrected lm recipe: compared with the recipe in the question, PrName,
# Out.SubInd, Out.SubInd.Code and Product are now also given the "sample ID"
# role, so they no longer enter the model as predictors.
lm_recipe <- 
  recipe(formula = logRR ~ ., data = af.train) %>%
  update_role(Site.Type, new_role = "predictor") %>%
  # Columns kept in the data but excluded from the predictor set.
  update_role(PrName,
              Out.SubInd,
              Out.SubInd.Code,
              Product,
              Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>%
  # Map factor levels not seen at training time to a dedicated new level.
  step_novel(Site.Type, -all_outcomes()) %>%
  # One indicator column per Site.Type level (one-hot), named with "_".
  # NOTE(review): one_hot = TRUE keeps an indicator for every level, which is
  # linearly dependent with a model intercept — confirm this cannot bring the
  # rank-deficiency warning back for the lm engine.
  step_dummy(Site.Type, one_hot = TRUE, naming = partial(dummy_names,sep = "_")) %>%
  # Drop predictors that contain a single unique value.
  step_zv(all_predictors()) %>%
  # Center and scale the non-nominal predictors.
  step_normalize(all_predictors(), -all_nominal()) %>%
  # Drop near-zero-variance numeric columns (outcome excluded).
  step_nzv(all_numeric(), -all_outcomes()) %>%
  # Drop numeric predictors involved in high pairwise correlations (the step's
  # default threshold is used).
  step_corr(all_numeric_predictors()) %>%
  # NOTE(review): centering and scaling again after step_normalize() looks
  # redundant — confirm it is intentional.
  step_center(all_numeric_predictors())  %>%
  step_scale(all_numeric_predictors())

r - Tidymodels，所有模型失败；model.frame.default 中的错误和秩不足拟合的预测可能会产生误导

1 回答 1

Related

Reference