1

I am currently applying the following recipe and workflow in order to fit a Random Forest with 5-fold cross-validation via fit_resamples. The workflow looks something like this:

library(tidymodels)

# Load the admissions data; the response has to be a factor for classification
train <- read.csv("https://pastebin.com/raw/LJQqdEEE") %>%
  mutate(accepted = as.factor(accepted))

# Keep 70% of the rows for training and hold out the rest for testing
new_split <- initial_split(train, prop = 0.7)
new_test <- testing(new_split)
new_train <- training(new_split)

# Feature engineering and data prep
# Recipe that models `accepted` from every other column of `new_train`.
# This is the version that triggers the "new levels in a factor: NA" failure
# reported below.
admission_rec <- 
  recipe(accepted ~ ., data = new_train) %>% 
  # Median-impute `sat`
  step_impute_median(sat) %>% 
  step_mutate(
    # Split the ';'-separated score string into a list column
    ap_scores = strsplit(as.character(ap_scores), ';'),
    # NOTE(review): unlist() flattens the whole list column, so each of the
    # four summaries below is one value pooled over all rows and recycled to
    # every row - confirm whether per-row summaries were intended.
    ap_score_max = max(as.numeric(unlist(ap_scores))),
    ap_score_avg = mean(as.numeric(unlist(ap_scores))),
    ap_score_min = min(as.numeric(unlist(ap_scores))),
    ap_score_med = median(as.numeric(unlist(ap_scores)))
  ) %>% 
  # One-hot encode `ethnicity`. NOTE(review): this runs before step_naomit(),
  # so missing `ethnicity` values reach the dummy step.
  step_dummy(ethnicity, one_hot = T) %>% 
  # Center and scale three numeric predictors. With skip = T the step is
  # applied when fitting but skipped when the model is used later on
  # (e.g. for prediction).
  step_center(c(essay_strength, family_income, sat), skip = T) %>%
  step_scale(c(essay_strength, family_income, sat), skip = T) %>%
  # Drop rows with missing values in any column (also skipped later on)
  step_naomit(everything(), skip = T) %>% 
  # Remove the intermediate list column created in step_mutate()
  step_rm(ap_scores)

# Random forest classifier fitted with the ranger engine
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger")

# Bundle the model specification with the preprocessing recipe
rf_workflow <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(admission_rec)

# 5-fold cross-validation splits of the training set
cv_folds <- vfold_cv(new_train, v = 5)

# Fit the workflow on every fold and collect the classification metrics
rf_res <- fit_resamples(
  rf_workflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas, accuracy,
    kap, roc_auc, sens, spec
  )
)

When fitting the model I am prompted with the following failure message:

preprocessor 1/1: There are new levels in a factor: NA
preprocessor 1/1, model 1/1 (predictions): Missing data in columns: ethnicity_Asian ...

This looks limited to the one-hot encoded columns, and it happens even with step_naomit(skip = TRUE). I had (apparently wrongly) assumed that placing step_naomit after step_mutate would take care of it.

I am probably overlooking something fairly simple here; this is my first stab at {tidymodels} after a long hiatus from R.

4

1 回答 1

1

You are on the right track. Unfortunately, step_naomit() is not the answer: the error arises in step_dummy() because ethnicity contains missing values and step_dummy() doesn't know how to deal with them. The solution is to use step_unknown() right before it; it takes a factor variable and assigns "unknown" to missing values.

I would also recommend that you do not set skip = T in step_center() and step_scale(), as that would apply centering and scaling when fitting the model but skip them when the model is used later on, such as in prediction. This would produce strange and undesired results.

library(tidymodels)
# Import data and convert the response to a factor (required for
# classification)
train <- read.csv("https://pastebin.com/raw/LJQqdEEE")
train[["accepted"]] <- as.factor(train[["accepted"]])

# Train/test split
# initial_split() assigns rows at random; seed the RNG first so the same
# 70/30 partition is obtained on every run.
set.seed(2021)
new_split <- initial_split(train, prop = 0.7)
new_train <- training(new_split)
new_test <- testing(new_split)

# Feature engineering and data prep
#
# Recipe predicting `accepted` from all other columns of `new_train`:
#   1. median-impute `sat`
#   2. parse the ';'-separated `ap_scores` string and derive per-applicant
#      max / mean / min / median AP scores
#   3. map missing `ethnicity` to an explicit "unknown" level so that
#      step_dummy() never meets NA, then one-hot encode it
#   4. center and scale the numeric predictors (not skipped, so the same
#      transformation is applied at prediction time)
#   5. drop the intermediate `ap_scores` list column
admission_rec <-
  recipe(accepted ~ ., data = new_train) %>%
  step_impute_median(sat) %>%
  step_mutate(
    # One numeric vector per row; the expressions below see this list column
    # instead of the raw string.
    ap_scores = lapply(strsplit(as.character(ap_scores), ";"), as.numeric),
    # Summarise row by row. Calling max()/mean()/... on unlist(ap_scores)
    # would pool every applicant's scores into one constant per column.
    # NOTE(review): a row with a missing or empty `ap_scores` yields
    # NA/NaN/Inf here - confirm the data has none, or impute afterwards.
    ap_score_max = vapply(ap_scores, max, numeric(1)),
    ap_score_avg = vapply(ap_scores, mean, numeric(1)),
    ap_score_min = vapply(ap_scores, min, numeric(1)),
    ap_score_med = vapply(ap_scores, median, numeric(1))
  ) %>%
  step_unknown(ethnicity) %>%
  step_dummy(ethnicity, one_hot = TRUE) %>%
  step_center(c(essay_strength, family_income, sat)) %>%
  step_scale(c(essay_strength, family_income, sat)) %>%
  step_rm(ap_scores)

# Random forest classifier fitted with the ranger engine
rf_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger")

# Bundle the model specification with the preprocessing recipe
rf_workflow <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(admission_rec)

# Cross validation
# Fold assignment and the random forest fits both draw random numbers; seed
# the RNG so the resampled metrics can be reproduced.
set.seed(2021)
cv_folds <- vfold_cv(new_train, v = 5)

# Fit model
# Fit the workflow on each of the 5 folds and compute the listed
# classification metrics per fold.
rf_res <- fit_resamples(
  rf_workflow,
  resamples = cv_folds,
  metrics = metric_set(
    recall, precision, f_meas, accuracy,
    kap, roc_auc, sens, spec
  )
)

rf_res
#> # Resampling results
#> # 5-fold cross-validation 
#> # A tibble: 5 x 4
#>   splits            id    .metrics         .notes          
#>   <list>            <chr> <list>           <list>          
#> 1 <split [560/140]> Fold1 <tibble [8 × 4]> <tibble [0 × 1]>
#> 2 <split [560/140]> Fold2 <tibble [8 × 4]> <tibble [0 × 1]>
#> 3 <split [560/140]> Fold3 <tibble [8 × 4]> <tibble [0 × 1]>
#> 4 <split [560/140]> Fold4 <tibble [8 × 4]> <tibble [0 × 1]>
#> 5 <split [560/140]> Fold5 <tibble [8 × 4]> <tibble [0 × 1]>

Created on 2021-06-22 by the reprex package (v2.0.0)

于 2021-06-22T15:58:09.343 回答