我已经为 TidyTuesday 上最近的咖啡数据集整理了一个数据预处理方法。我的意图是生成一个工作流,然后从那里调整一个超参数。我特别感兴趣的是通过各种update_role()
函数手动声明预测变量和结果,而不是使用公式,因为我对这种类型的变量选择有一些很好的计划(这真是个好主意!)。
prep
下面的示例生成了一个与and配合得很好的配方bake(coffee_test)
。如果我取消选择结果列,它甚至可以工作,例如。coffee_recipe %>% bake(select(coffee_test, -cupper_points))
. 但是,当我运行工作流程时,tune_grid
我会得到如图所示的错误。看起来tune_grid
找不到没有“预测器”角色的变量,即使bake
做得很好。
现在,如果我改为使用公式和step_rm
我不关心的变量以正常方式做事,那么事情大多会起作用——我会收到一些关于缺少country_of_origin
值的行的警告,我觉得这很奇怪,因为我应该插补那些。我完全有可能误解了角色的目的以及如何使用它们。
library(tidyverse)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom 0.7.0 ✓ recipes 0.1.13
#> ✓ dials 0.0.8 ✓ rsample 0.0.7
#> ✓ infer 0.5.3 ✓ tune 0.1.1
#> ✓ modeldata 0.0.2 ✓ workflows 0.1.2
#> ✓ parsnip 0.1.2 ✓ yardstick 0.0.7
#> ── Conflicts ──────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter() masks stats::filter()
#> x recipes::fixed() masks stringr::fixed()
#> x dplyr::lag() masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step() masks stats::step()
set.seed(12345)
coffee <- tidytuesdayR::tt_load(2020, week = 28)$coffee_ratings
#> --- Compiling #TidyTuesday Information for 2020-07-07 ----
#> --- There is 1 file available ---
#> --- Starting Download ---
#>
#> Downloading file 1 of 1: `coffee_ratings.csv`
#> --- Download complete ---
colnames(coffee)
#> [1] "total_cup_points" "species" "owner"
#> [4] "country_of_origin" "farm_name" "lot_number"
#> [7] "mill" "ico_number" "company"
#> [10] "altitude" "region" "producer"
#> [13] "number_of_bags" "bag_weight" "in_country_partner"
#> [16] "harvest_year" "grading_date" "owner_1"
#> [19] "variety" "processing_method" "aroma"
#> [22] "flavor" "aftertaste" "acidity"
#> [25] "body" "balance" "uniformity"
#> [28] "clean_cup" "sweetness" "cupper_points"
#> [31] "moisture" "category_one_defects" "quakers"
#> [34] "color" "category_two_defects" "expiration"
#> [37] "certification_body" "certification_address" "certification_contact"
#> [40] "unit_of_measurement" "altitude_low_meters" "altitude_high_meters"
#> [43] "altitude_mean_meters"
coffee_split <- initial_split(coffee, prop = 0.8)
coffee_train <- training(coffee_split)
coffee_test <- testing(coffee_split)
coffee_recipe <- recipe(coffee_train) %>%
update_role(cupper_points, new_role = "outcome") %>%
update_role(
variety, processing_method, country_of_origin,
aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
new_role = "predictor"
) %>%
step_string2factor(all_nominal(), -all_outcomes()) %>%
step_knnimpute(
country_of_origin, altitude_mean_meters,
impute_with = imp_vars(
in_country_partner, company, region, farm_name, certification_body
)
) %>%
step_unknown(variety, processing_method, new_level = "Unknown") %>%
step_other(country_of_origin, threshold = 0.01) %>%
step_other(processing_method, threshold = 0.10) %>%
step_other(variety, threshold = 0.10)
coffee_recipe
#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 9
#>
#> 33 variables with undeclared roles
#>
#> Operations:
#>
#> Factor variables from all_nominal(), -all_outcomes()
#> K-nearest neighbor imputation for country_of_origin, altitude_mean_meters
#> Unknown factor level assignment for variety, processing_method
#> Collapsing factor levels for country_of_origin
#> Collapsing factor levels for processing_method
#> Collapsing factor levels for variety
# This works just fine
coffee_recipe %>%
prep(coffee_train) %>%
bake(select(coffee_test, -cupper_points)) %>%
head()
#> # A tibble: 6 x 42
#> total_cup_points species owner country_of_orig… farm_name lot_number mill
#> <dbl> <fct> <fct> <fct> <fct> <fct> <fct>
#> 1 90.6 Arabica meta… Ethiopia metad plc <NA> meta…
#> 2 87.9 Arabica cqi … other <NA> <NA> <NA>
#> 3 87.9 Arabica grou… United States (… <NA> <NA> <NA>
#> 4 87.3 Arabica ethi… Ethiopia <NA> <NA> <NA>
#> 5 87.2 Arabica cqi … other <NA> <NA> <NA>
#> 6 86.9 Arabica ethi… Ethiopia <NA> <NA> <NA>
#> # … with 35 more variables: ico_number <fct>, company <fct>, altitude <fct>,
#> # region <fct>, producer <fct>, number_of_bags <dbl>, bag_weight <fct>,
#> # in_country_partner <fct>, harvest_year <fct>, grading_date <fct>,
#> # owner_1 <fct>, variety <fct>, processing_method <fct>, aroma <dbl>,
#> # flavor <dbl>, aftertaste <dbl>, acidity <dbl>, body <dbl>, balance <dbl>,
#> # uniformity <dbl>, clean_cup <dbl>, sweetness <dbl>, moisture <dbl>,
#> # category_one_defects <dbl>, quakers <dbl>, color <fct>,
#> # category_two_defects <dbl>, expiration <fct>, certification_body <fct>,
#> # certification_address <fct>, certification_contact <fct>,
#> # unit_of_measurement <fct>, altitude_low_meters <dbl>,
#> # altitude_high_meters <dbl>, altitude_mean_meters <dbl>
# Now let's try putting it into a workflow and running tune_grid
coffee_model <- rand_forest(trees = 500, mtry = tune()) %>%
set_engine("ranger") %>%
set_mode("regression")
coffee_model
#> Random Forest Model Specification (regression)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = 500
#>
#> Computational engine: ranger
coffee_workflow <- workflow() %>%
add_recipe(coffee_recipe) %>%
add_model(coffee_model)
coffee_workflow
#> ══ Workflow ═══════════════════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> ── Preprocessor ───────────────────────────────────────────────────────────────────────────────
#> 6 Recipe Steps
#>
#> ● step_string2factor()
#> ● step_knnimpute()
#> ● step_unknown()
#> ● step_other()
#> ● step_other()
#> ● step_other()
#>
#> ── Model ──────────────────────────────────────────────────────────────────────────────────────
#> Random Forest Model Specification (regression)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = 500
#>
#> Computational engine: ranger
coffee_grid <- expand_grid(mtry = c(2, 5))
coffee_folds <- vfold_cv(coffee_train, v = 5)
coffee_workflow %>%
tune_grid(
resamples = coffee_folds,
grid = coffee_grid
)
#> x Fold1: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold1: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> Warning: All models failed in tune_grid(). See the `.notes` column.
#> Warning: This tuning result has notes. Example notes on model fitting include:
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> # Tuning results
#> # 5-fold cross-validation
#> # A tibble: 5 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [857/215]> Fold1 <NULL> <tibble [2 × 1]>
#> 2 <split [857/215]> Fold2 <NULL> <tibble [2 × 1]>
#> 3 <split [858/214]> Fold3 <NULL> <tibble [2 × 1]>
#> 4 <split [858/214]> Fold4 <NULL> <tibble [2 × 1]>
#> 5 <split [858/214]> Fold5 <NULL> <tibble [2 × 1]>
由reprex 包(v0.3.0)于 2020 年 7 月 21 日创建
会话信息devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.0 (2020-04-24)
#> os Ubuntu 20.04 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language en_AU:en
#> collate en_AU.UTF-8
#> ctype en_AU.UTF-8
#> tz Australia/Melbourne
#> date 2020-07-21
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.1.8 2020-06-17 [1] CRAN (R 4.0.0)
#> blob 1.2.1 2020-01-20 [1] CRAN (R 4.0.0)
#> broom * 0.7.0 2020-07-09 [1] CRAN (R 4.0.0)
#> callr 3.4.3 2020-03-28 [1] CRAN (R 4.0.0)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.0)
#> class 7.3-17 2020-04-26 [4] CRAN (R 4.0.0)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.0)
#> codetools 0.2-16 2018-12-24 [4] CRAN (R 4.0.0)
#> colorspace 1.4-1 2019-03-18 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> curl 4.3 2019-12-02 [1] CRAN (R 4.0.0)
#> DBI 1.1.0 2019-12-15 [1] CRAN (R 4.0.0)
#> dbplyr 1.4.4 2020-05-27 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.0 2020-04-10 [1] CRAN (R 4.0.0)
#> dials * 0.0.8 2020-07-08 [1] CRAN (R 4.0.0)
#> DiceDesign 1.8-1 2019-07-31 [1] CRAN (R 4.0.0)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.0)
#> dplyr * 1.0.0 2020-05-29 [1] CRAN (R 4.0.0)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> forcats * 0.5.0 2020-03-01 [1] CRAN (R 4.0.0)
#> foreach 1.5.0 2020-03-30 [1] CRAN (R 4.0.0)
#> fs 1.4.1 2020-04-04 [1] CRAN (R 4.0.0)
#> furrr 0.1.0 2018-05-16 [1] CRAN (R 4.0.0)
#> future 1.17.0 2020-04-18 [1] CRAN (R 4.0.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.0)
#> ggplot2 * 3.3.2.9000 2020-07-10 [1] Github (tidyverse/ggplot2@a11e098)
#> globals 0.12.5 2019-12-07 [1] CRAN (R 4.0.0)
#> glue 1.4.1 2020-05-13 [1] CRAN (R 4.0.0)
#> gower 0.2.2 2020-06-23 [1] CRAN (R 4.0.0)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.0.0)
#> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.0)
#> hardhat 0.1.4 2020-07-02 [1] CRAN (R 4.0.0)
#> haven 2.2.0 2019-11-08 [1] CRAN (R 4.0.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> httr 1.4.1 2019-08-05 [1] CRAN (R 4.0.0)
#> infer * 0.5.3 2020-07-14 [1] CRAN (R 4.0.0)
#> ipred 0.9-9 2019-04-28 [1] CRAN (R 4.0.0)
#> iterators 1.0.12 2019-07-26 [1] CRAN (R 4.0.0)
#> jsonlite 1.7.0 2020-06-25 [1] CRAN (R 4.0.0)
#> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.0)
#> lattice 0.20-41 2020-04-02 [4] CRAN (R 4.0.0)
#> lava 1.6.7 2020-03-05 [1] CRAN (R 4.0.0)
#> lhs 1.0.2 2020-04-13 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.0.0)
#> lubridate 1.7.8 2020-04-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> MASS 7.3-51.6 2020-04-26 [4] CRAN (R 4.0.0)
#> Matrix 1.2-18 2019-11-27 [4] CRAN (R 4.0.0)
#> memoise 1.1.0.9000 2020-05-09 [1] Github (hadley/memoise@4aefd9f)
#> modeldata * 0.0.2 2020-06-22 [1] CRAN (R 4.0.0)
#> modelr 0.1.6 2020-02-22 [1] CRAN (R 4.0.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.0)
#> nnet 7.3-14 2020-04-26 [4] CRAN (R 4.0.0)
#> parsnip * 0.1.2 2020-07-03 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.0)
#> pkgbuild 1.0.8 2020-05-07 [1] CRAN (R 4.0.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> pROC 1.16.2 2020-03-19 [1] CRAN (R 4.0.0)
#> processx 3.4.3 2020-07-05 [1] CRAN (R 4.0.0)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.0.0)
#> ps 1.3.3 2020-05-08 [1] CRAN (R 4.0.0)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.0)
#> ranger 0.12.1 2020-01-10 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.0)
#> readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.0)
#> readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.0)
#> recipes * 0.1.13 2020-06-23 [1] CRAN (R 4.0.0)
#> remotes 2.1.1 2020-02-15 [1] CRAN (R 4.0.0)
#> reprex 0.3.0 2019-05-16 [1] CRAN (R 4.0.0)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.0)
#> rmarkdown 2.3.2 2020-07-12 [1] Github (rstudio/rmarkdown@ff1b279)
#> rpart 4.1-15 2019-04-12 [4] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> rsample * 0.0.7 2020-06-04 [1] CRAN (R 4.0.0)
#> rstudioapi 0.11 2020-02-07 [1] CRAN (R 4.0.0)
#> rvest 0.3.5 2019-11-08 [1] CRAN (R 4.0.0)
#> scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.0)
#> selectr 0.4-2 2019-11-20 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> stringi 1.4.6 2020-02-17 [1] CRAN (R 4.0.0)
#> stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> survival 3.1-12 2020-04-10 [4] CRAN (R 4.0.0)
#> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.0)
#> tibble * 3.0.3 2020-07-10 [1] CRAN (R 4.0.0)
#> tidymodels * 0.1.1 2020-07-14 [1] CRAN (R 4.0.0)
#> tidyr * 1.1.0 2020-05-20 [1] CRAN (R 4.0.0)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> tidytuesdayR 1.0.1 2020-07-10 [1] CRAN (R 4.0.0)
#> tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.0)
#> timeDate 3043.102 2018-02-21 [1] CRAN (R 4.0.0)
#> tune * 0.1.1 2020-07-08 [1] CRAN (R 4.0.0)
#> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.0)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 4.0.0)
#> vctrs 0.3.2 2020-07-15 [1] CRAN (R 4.0.0)
#> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.0)
#> workflows * 0.1.2 2020-07-07 [1] CRAN (R 4.0.0)
#> xfun 0.15 2020-06-21 [1] CRAN (R 4.0.0)
#> xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#> yardstick * 0.0.7 2020-07-13 [1] CRAN (R 4.0.0)
#>
#> [1] /home/mdneuzerling/R/x86_64-pc-linux-gnu-library/4.0
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library