这是我在 Stack Overflow 上发布的第一个问题。我已尽力使用 R 的 reprex 包创建一个合格的可复现示例(reprex)。欢迎任何反馈。下面进入正题:
我相信错误位于我的代码的最底部。
我一直在尝试学习如何使用 tidymodels 系列的 R 包,在尝试使用 tune_grid 调整随机森林的树的数量(trees)和 min_n 时遇到了错误。我大致参照了这篇博客文章:http://www.rebeccabarter.com/blog/2020-03-25_machine_learning/。
据我了解,可以使用工作流(workflow)将配方(recipe)和模型捆绑在一起,然后将其传入 tune_grid 函数,并借助某种重采样方法(如交叉验证,cv)来调整超参数。我一定是在某个地方出了错,因为 tune_grid 函数没有成功运行。
这是我的代码:
#install.packages("pacman")
#install.packages("ranger")
pacman::p_load(tidyverse, # all the things
tidymodels, workflows, tune # tidy ml
)
# Build the modelling data from ggplot2's `mpg`: add `trans2`, a two-level
# label ("auto" / "manual") derived from `trans`, then drop `trans` itself.
dat <- ggplot2::mpg %>%
  mutate(
    trans2 = if_else(str_detect(trans, "auto"), "auto", "manual"),
    # Assigning NULL removes the column; trans2 above is computed first.
    trans = NULL
  )

# Preview the first rows.
head(dat)
#> # A tibble: 6 x 11
#> manufacturer model displ year cyl drv cty hwy fl class trans2
#> <chr> <chr> <dbl> <int> <int> <chr> <int> <int> <chr> <chr> <chr>
#> 1 audi a4 1.8 1999 4 f 18 29 p compact auto
#> 2 audi a4 1.8 1999 4 f 21 29 p compact manual
#> 3 audi a4 2 2008 4 f 20 31 p compact manual
#> 4 audi a4 2 2008 4 f 21 30 p compact auto
#> 5 audi a4 2.8 1999 6 f 16 26 p compact auto
#> 6 audi a4 2.8 1999 6 f 18 26 p compact manual
# Split the data 3/4 vs 1/4, stratifying on the outcome `trans2`.
dat_split <- dat %>%
  initial_split(prop = 3/4, strata = trans2)
dat_split
#> <Training/Validation/Total>
#> <176/58/234>

# Take the training portion, then build cross-validation resamples of it,
# again stratified on `trans2`.
dat_train <- dat_split %>%
  training()
dat_cv <- dat_train %>%
  vfold_cv(strata = trans2)
dat_cv
#> # 10-fold cross-validation using stratification
#> # A tibble: 10 x 2
#> splits id
#> * <named list> <chr>
#> 1 <split [158/18]> Fold01
#> 2 <split [158/18]> Fold02
#> 3 <split [158/18]> Fold03
#> 4 <split [158/18]> Fold04
#> 5 <split [158/18]> Fold05
#> 6 <split [158/18]> Fold06
#> 7 <split [158/18]> Fold07
#> 8 <split [158/18]> Fold08
#> 9 <split [160/16]> Fold09
#> 10 <split [160/16]> Fold10
# Preprocessing recipe: `trans2` is the outcome, every other column is a
# predictor. The recipe is declared on the training set rather than on the
# full data, so the held-out rows play no part in defining it.
dat_recipe <- recipe(trans2 ~ ., data = dat_train) %>%
  # trans2 is nominal, so all_numeric() only matches predictors here.
  step_normalize(all_numeric()) %>%
  # Exclude the outcome: all_nominal() on its own also matches trans2, which
  # would replace the class label with indicator columns and leave the
  # classification model without a factor outcome to predict.
  step_dummy(all_nominal(), -all_outcomes())
dat_recipe
#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 10
#>
#> Operations:
#>
#> Centering and scaling for all_numeric
#> Dummy variables from all_nominal, -all_outcomes()
# Random forest classifier on the ranger engine: mtry is fixed at 4, while
# trees and min_n are left as tune() placeholders to be filled in by tuning.
rf_model <- rand_forest(mtry = 4, trees = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")
rf_model
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = 4
#> trees = tune()
#> min_n = tune()
#>
#> Computational engine: ranger
# Start from an empty workflow, then attach the preprocessing recipe and the
# model specification so both travel together as a single object.
rf_workflow <- workflow()
rf_workflow <- add_recipe(rf_workflow, dat_recipe)
rf_workflow <- add_model(rf_workflow, rf_model)
rf_workflow
#> == Workflow ===========================================================================================================================
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> -- Preprocessor -----------------------------------------------------------------------------------------------------------------------
#> 2 Recipe Steps
#>
#> * step_normalize()
#> * step_dummy()
#>
#> -- Model ------------------------------------------------------------------------------------------------------------------------------
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = 4
#> trees = tune()
#> min_n = tune()
#>
#> Computational engine: ranger
# Candidate tuning grid: 10 combinations generated by grid_max_entropy() over
# the parameters that rf_model marks with tune() (trees and min_n).
rf_grid <- grid_max_entropy(parameters(rf_model), size = 10)
rf_grid
#> # A tibble: 10 x 2
#> trees min_n
#> <int> <int>
#> 1 1014 12
#> 2 737 37
#> 3 339 22
#> 4 1728 2
#> 5 1951 30
#> 6 1673 18
#> 7 9 40
#> 8 966 26
#> 9 345 5
#> 10 1440 39
# Tune the workflow: evaluate the candidates in rf_grid on the resamples in
# dat_cv, scoring each with accuracy and ROC AUC.
rf_tune_cv <- tune_grid(
  rf_workflow,
  resamples = dat_cv,
  grid = rf_grid,
  metrics = metric_set(accuracy, roc_auc)
)
#> x Fold01: model 1/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 2/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 3/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 4/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 5/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 6/10: Error: A `parameters` object has required columns.
#> Missing ...
#> x Fold01: model 7/10: Error: A `parameters` object has required columns.
#> Missing ...