7

我为 TidyTuesday 上最近的咖啡数据集整理了一个数据预处理配方(recipe)。我的打算是先生成一个工作流(workflow),然后在此基础上调优一个超参数。我特别想通过各种 update_role() 函数手动声明预测变量和结果变量,而不是使用公式,因为我对这种变量选择方式有一些不错的计划(这确实是个很好的设计!)。

下面的示例生成了一个配方,它与 prep 和 bake(coffee_test) 配合得很好。即使我去掉结果列,它也能正常工作,例如 coffee_recipe %>% bake(select(coffee_test, -cupper_points))。但是,当我用 tune_grid 运行工作流时,会得到如下所示的错误。看起来 tune_grid 找不到那些没有“predictor”角色的变量,尽管 bake 处理得很好。

现在,如果我改用常规做法——使用公式,并用 step_rm 去掉我不关心的变量——那么基本上可以运行,只是会收到一些警告,提示某些行缺少 country_of_origin 的值。我觉得这很奇怪,因为我应该已经对这些值做了插补。我完全有可能误解了角色(role)的用途以及使用方式。

library(tidyverse)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom     0.7.0      ✓ recipes   0.1.13
#> ✓ dials     0.0.8      ✓ rsample   0.0.7 
#> ✓ infer     0.5.3      ✓ tune      0.1.1 
#> ✓ modeldata 0.0.2      ✓ workflows 0.1.2 
#> ✓ parsnip   0.1.2      ✓ yardstick 0.0.7
#> ── Conflicts ──────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter()   masks stats::filter()
#> x recipes::fixed()  masks stringr::fixed()
#> x dplyr::lag()      masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step()   masks stats::step()

# Fix the RNG seed so the random split and resampling below are reproducible
set.seed(12345)

# Load the TidyTuesday data for 2020 week 28 and keep its coffee_ratings table
coffee <- tidytuesdayR::tt_load(2020, week = 28)$coffee_ratings
#> --- Compiling #TidyTuesday Information for 2020-07-07 ----
#> --- There is 1 file available ---
#> --- Starting Download ---
#> 
#>  Downloading file 1 of 1: `coffee_ratings.csv`
#> --- Download complete ---
# List the columns available in the dataset
colnames(coffee)
#>  [1] "total_cup_points"      "species"               "owner"                
#>  [4] "country_of_origin"     "farm_name"             "lot_number"           
#>  [7] "mill"                  "ico_number"            "company"              
#> [10] "altitude"              "region"                "producer"             
#> [13] "number_of_bags"        "bag_weight"            "in_country_partner"   
#> [16] "harvest_year"          "grading_date"          "owner_1"              
#> [19] "variety"               "processing_method"     "aroma"                
#> [22] "flavor"                "aftertaste"            "acidity"              
#> [25] "body"                  "balance"               "uniformity"           
#> [28] "clean_cup"             "sweetness"             "cupper_points"        
#> [31] "moisture"              "category_one_defects"  "quakers"              
#> [34] "color"                 "category_two_defects"  "expiration"           
#> [37] "certification_body"    "certification_address" "certification_contact"
#> [40] "unit_of_measurement"   "altitude_low_meters"   "altitude_high_meters" 
#> [43] "altitude_mean_meters"

# Hold out 20% of the rows for testing; the remaining 80% form the training set
coffee_split <- coffee %>% initial_split(prop = 0.8)
coffee_train <- coffee_split %>% training()
coffee_test <- coffee_split %>% testing()

# Preprocessing recipe with roles declared manually instead of via a formula.
# Columns that are not given a role here keep an undeclared role.
coffee_recipe <- recipe(coffee_train) %>%
  # The response to be modelled
  update_role(cupper_points, new_role = "outcome") %>%
  # The nine columns used as predictors
  update_role(
    variety, processing_method, country_of_origin,
    aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
    new_role = "predictor"
  ) %>%
  # Convert string columns to factors. The selector is type-based, so it is
  # not limited to the declared predictors (e.g. species and owner are
  # converted as well).
  step_string2factor(all_nominal(), -all_outcomes()) %>%
  # K-nearest-neighbour imputation of two predictors, driven by the
  # `impute_with` columns (none of which has a declared role)
  step_knnimpute(
    country_of_origin, altitude_mean_meters,
    impute_with = imp_vars(
      in_country_partner, company, region, farm_name, certification_body
    )
  ) %>%
  # Assign an explicit "Unknown" factor level in these two columns
  step_unknown(variety, processing_method, new_level = "Unknown") %>%
  # Collapse infrequent factor levels, with a separate threshold per column
  step_other(country_of_origin, threshold = 0.01) %>%
  step_other(processing_method, threshold = 0.10) %>%
  step_other(variety, threshold = 0.10)
# Print the recipe summary (roles and operations)
coffee_recipe
#> Data Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor          9
#> 
#>   33 variables with undeclared roles
#> 
#> Operations:
#> 
#> Factor variables from all_nominal(), -all_outcomes()
#> K-nearest neighbor imputation for country_of_origin, altitude_mean_meters
#> Unknown factor level assignment for variety, processing_method
#> Collapsing factor levels for country_of_origin
#> Collapsing factor levels for processing_method
#> Collapsing factor levels for variety

# Sanity check: the prepped recipe bakes the test data without problems,
# even when the outcome column has been removed
head(
  bake(
    prep(coffee_recipe, coffee_train),
    select(coffee_test, -cupper_points)
  )
)
#> # A tibble: 6 x 42
#>   total_cup_points species owner country_of_orig… farm_name lot_number mill 
#>              <dbl> <fct>   <fct> <fct>            <fct>     <fct>      <fct>
#> 1             90.6 Arabica meta… Ethiopia         metad plc <NA>       meta…
#> 2             87.9 Arabica cqi … other            <NA>      <NA>       <NA> 
#> 3             87.9 Arabica grou… United States (… <NA>      <NA>       <NA> 
#> 4             87.3 Arabica ethi… Ethiopia         <NA>      <NA>       <NA> 
#> 5             87.2 Arabica cqi … other            <NA>      <NA>       <NA> 
#> 6             86.9 Arabica ethi… Ethiopia         <NA>      <NA>       <NA> 
#> # … with 35 more variables: ico_number <fct>, company <fct>, altitude <fct>,
#> #   region <fct>, producer <fct>, number_of_bags <dbl>, bag_weight <fct>,
#> #   in_country_partner <fct>, harvest_year <fct>, grading_date <fct>,
#> #   owner_1 <fct>, variety <fct>, processing_method <fct>, aroma <dbl>,
#> #   flavor <dbl>, aftertaste <dbl>, acidity <dbl>, body <dbl>, balance <dbl>,
#> #   uniformity <dbl>, clean_cup <dbl>, sweetness <dbl>, moisture <dbl>,
#> #   category_one_defects <dbl>, quakers <dbl>, color <fct>,
#> #   category_two_defects <dbl>, expiration <fct>, certification_body <fct>,
#> #   certification_address <fct>, certification_contact <fct>,
#> #   unit_of_measurement <fct>, altitude_low_meters <dbl>,
#> #   altitude_high_meters <dbl>, altitude_mean_meters <dbl>

# Now let's try putting it into a workflow and running tune_grid
# Random forest regression fitted with ranger; `mtry` is left as a tuning
# placeholder and the number of trees is fixed at 500
coffee_model <- rand_forest(
  mode = "regression",
  mtry = tune(),
  trees = 500
) %>%
  set_engine("ranger")
coffee_model
#> Random Forest Model Specification (regression)
#> 
#> Main Arguments:
#>   mtry = tune()
#>   trees = 500
#> 
#> Computational engine: ranger

# Bundle the preprocessing recipe and the model specification into one workflow
coffee_workflow <- add_model(
  add_recipe(workflow(), coffee_recipe),
  coffee_model
)
coffee_workflow
#> ══ Workflow ═══════════════════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ───────────────────────────────────────────────────────────────────────────────
#> 6 Recipe Steps
#> 
#> ● step_string2factor()
#> ● step_knnimpute()
#> ● step_unknown()
#> ● step_other()
#> ● step_other()
#> ● step_other()
#> 
#> ── Model ──────────────────────────────────────────────────────────────────────────────────────
#> Random Forest Model Specification (regression)
#> 
#> Main Arguments:
#>   mtry = tune()
#>   trees = 500
#> 
#> Computational engine: ranger

# Candidate `mtry` values and 5-fold cross-validation resamples of the
# training set
coffee_grid <- tibble(mtry = c(2, 5))
coffee_folds <- coffee_train %>% vfold_cv(v = 5)

# Evaluate every grid candidate on every fold
tune_grid(
  coffee_workflow,
  resamples = coffee_folds,
  grid = coffee_grid
)
#> x Fold1: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold1: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> Warning: All models failed in tune_grid(). See the `.notes` column.
#> Warning: This tuning result has notes. Example notes on model fitting include:
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> # Tuning results
#> # 5-fold cross-validation 
#> # A tibble: 5 x 4
#>   splits            id    .metrics .notes          
#>   <list>            <chr> <list>   <list>          
#> 1 <split [857/215]> Fold1 <NULL>   <tibble [2 × 1]>
#> 2 <split [857/215]> Fold2 <NULL>   <tibble [2 × 1]>
#> 3 <split [858/214]> Fold3 <NULL>   <tibble [2 × 1]>
#> 4 <split [858/214]> Fold4 <NULL>   <tibble [2 × 1]>
#> 5 <split [858/214]> Fold5 <NULL>   <tibble [2 × 1]>

reprex 包(v0.3.0)于 2020 年 7 月 21 日创建

会话信息
# Record the R version, platform and package versions used for this reprex
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 4.0.0 (2020-04-24)
#>  os       Ubuntu 20.04 LTS            
#>  system   x86_64, linux-gnu           
#>  ui       X11                         
#>  language en_AU:en                    
#>  collate  en_AU.UTF-8                 
#>  ctype    en_AU.UTF-8                 
#>  tz       Australia/Melbourne         
#>  date     2020-07-21                  
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date       lib source                            
#>  assertthat     0.2.1      2019-03-21 [1] CRAN (R 4.0.0)                    
#>  backports      1.1.8      2020-06-17 [1] CRAN (R 4.0.0)                    
#>  blob           1.2.1      2020-01-20 [1] CRAN (R 4.0.0)                    
#>  broom        * 0.7.0      2020-07-09 [1] CRAN (R 4.0.0)                    
#>  callr          3.4.3      2020-03-28 [1] CRAN (R 4.0.0)                    
#>  cellranger     1.1.0      2016-07-27 [1] CRAN (R 4.0.0)                    
#>  class          7.3-17     2020-04-26 [4] CRAN (R 4.0.0)                    
#>  cli            2.0.2      2020-02-28 [1] CRAN (R 4.0.0)                    
#>  codetools      0.2-16     2018-12-24 [4] CRAN (R 4.0.0)                    
#>  colorspace     1.4-1      2019-03-18 [1] CRAN (R 4.0.0)                    
#>  crayon         1.3.4      2017-09-16 [1] CRAN (R 4.0.0)                    
#>  curl           4.3        2019-12-02 [1] CRAN (R 4.0.0)                    
#>  DBI            1.1.0      2019-12-15 [1] CRAN (R 4.0.0)                    
#>  dbplyr         1.4.4      2020-05-27 [1] CRAN (R 4.0.0)                    
#>  desc           1.2.0      2018-05-01 [1] CRAN (R 4.0.0)                    
#>  devtools       2.3.0      2020-04-10 [1] CRAN (R 4.0.0)                    
#>  dials        * 0.0.8      2020-07-08 [1] CRAN (R 4.0.0)                    
#>  DiceDesign     1.8-1      2019-07-31 [1] CRAN (R 4.0.0)                    
#>  digest         0.6.25     2020-02-23 [1] CRAN (R 4.0.0)                    
#>  dplyr        * 1.0.0      2020-05-29 [1] CRAN (R 4.0.0)                    
#>  ellipsis       0.3.1      2020-05-15 [1] CRAN (R 4.0.0)                    
#>  evaluate       0.14       2019-05-28 [1] CRAN (R 4.0.0)                    
#>  fansi          0.4.1      2020-01-08 [1] CRAN (R 4.0.0)                    
#>  forcats      * 0.5.0      2020-03-01 [1] CRAN (R 4.0.0)                    
#>  foreach        1.5.0      2020-03-30 [1] CRAN (R 4.0.0)                    
#>  fs             1.4.1      2020-04-04 [1] CRAN (R 4.0.0)                    
#>  furrr          0.1.0      2018-05-16 [1] CRAN (R 4.0.0)                    
#>  future         1.17.0     2020-04-18 [1] CRAN (R 4.0.0)                    
#>  generics       0.0.2      2018-11-29 [1] CRAN (R 4.0.0)                    
#>  ggplot2      * 3.3.2.9000 2020-07-10 [1] Github (tidyverse/ggplot2@a11e098)
#>  globals        0.12.5     2019-12-07 [1] CRAN (R 4.0.0)                    
#>  glue           1.4.1      2020-05-13 [1] CRAN (R 4.0.0)                    
#>  gower          0.2.2      2020-06-23 [1] CRAN (R 4.0.0)                    
#>  GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.0.0)                    
#>  gtable         0.3.0      2019-03-25 [1] CRAN (R 4.0.0)                    
#>  hardhat        0.1.4      2020-07-02 [1] CRAN (R 4.0.0)                    
#>  haven          2.2.0      2019-11-08 [1] CRAN (R 4.0.0)                    
#>  highr          0.8        2019-03-20 [1] CRAN (R 4.0.0)                    
#>  hms            0.5.3      2020-01-08 [1] CRAN (R 4.0.0)                    
#>  htmltools      0.5.0      2020-06-16 [1] CRAN (R 4.0.0)                    
#>  httr           1.4.1      2019-08-05 [1] CRAN (R 4.0.0)                    
#>  infer        * 0.5.3      2020-07-14 [1] CRAN (R 4.0.0)                    
#>  ipred          0.9-9      2019-04-28 [1] CRAN (R 4.0.0)                    
#>  iterators      1.0.12     2019-07-26 [1] CRAN (R 4.0.0)                    
#>  jsonlite       1.7.0      2020-06-25 [1] CRAN (R 4.0.0)                    
#>  knitr          1.29       2020-06-23 [1] CRAN (R 4.0.0)                    
#>  lattice        0.20-41    2020-04-02 [4] CRAN (R 4.0.0)                    
#>  lava           1.6.7      2020-03-05 [1] CRAN (R 4.0.0)                    
#>  lhs            1.0.2      2020-04-13 [1] CRAN (R 4.0.0)                    
#>  lifecycle      0.2.0      2020-03-06 [1] CRAN (R 4.0.0)                    
#>  listenv        0.8.0      2019-12-05 [1] CRAN (R 4.0.0)                    
#>  lubridate      1.7.8      2020-04-06 [1] CRAN (R 4.0.0)                    
#>  magrittr       1.5        2014-11-22 [1] CRAN (R 4.0.0)                    
#>  MASS           7.3-51.6   2020-04-26 [4] CRAN (R 4.0.0)                    
#>  Matrix         1.2-18     2019-11-27 [4] CRAN (R 4.0.0)                    
#>  memoise        1.1.0.9000 2020-05-09 [1] Github (hadley/memoise@4aefd9f)   
#>  modeldata    * 0.0.2      2020-06-22 [1] CRAN (R 4.0.0)                    
#>  modelr         0.1.6      2020-02-22 [1] CRAN (R 4.0.0)                    
#>  munsell        0.5.0      2018-06-12 [1] CRAN (R 4.0.0)                    
#>  nnet           7.3-14     2020-04-26 [4] CRAN (R 4.0.0)                    
#>  parsnip      * 0.1.2      2020-07-03 [1] CRAN (R 4.0.0)                    
#>  pillar         1.4.6      2020-07-10 [1] CRAN (R 4.0.0)                    
#>  pkgbuild       1.0.8      2020-05-07 [1] CRAN (R 4.0.0)                    
#>  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.0.0)                    
#>  pkgload        1.1.0      2020-05-29 [1] CRAN (R 4.0.0)                    
#>  plyr           1.8.6      2020-03-03 [1] CRAN (R 4.0.0)                    
#>  prettyunits    1.1.1      2020-01-24 [1] CRAN (R 4.0.0)                    
#>  pROC           1.16.2     2020-03-19 [1] CRAN (R 4.0.0)                    
#>  processx       3.4.3      2020-07-05 [1] CRAN (R 4.0.0)                    
#>  prodlim        2019.11.13 2019-11-17 [1] CRAN (R 4.0.0)                    
#>  ps             1.3.3      2020-05-08 [1] CRAN (R 4.0.0)                    
#>  purrr        * 0.3.4      2020-04-17 [1] CRAN (R 4.0.0)                    
#>  R6             2.4.1      2019-11-12 [1] CRAN (R 4.0.0)                    
#>  ranger         0.12.1     2020-01-10 [1] CRAN (R 4.0.0)                    
#>  Rcpp           1.0.5      2020-07-06 [1] CRAN (R 4.0.0)                    
#>  readr        * 1.3.1      2018-12-21 [1] CRAN (R 4.0.0)                    
#>  readxl         1.3.1      2019-03-13 [1] CRAN (R 4.0.0)                    
#>  recipes      * 0.1.13     2020-06-23 [1] CRAN (R 4.0.0)                    
#>  remotes        2.1.1      2020-02-15 [1] CRAN (R 4.0.0)                    
#>  reprex         0.3.0      2019-05-16 [1] CRAN (R 4.0.0)                    
#>  rlang          0.4.7      2020-07-09 [1] CRAN (R 4.0.0)                    
#>  rmarkdown      2.3.2      2020-07-12 [1] Github (rstudio/rmarkdown@ff1b279)
#>  rpart          4.1-15     2019-04-12 [4] CRAN (R 4.0.0)                    
#>  rprojroot      1.3-2      2018-01-03 [1] CRAN (R 4.0.0)                    
#>  rsample      * 0.0.7      2020-06-04 [1] CRAN (R 4.0.0)                    
#>  rstudioapi     0.11       2020-02-07 [1] CRAN (R 4.0.0)                    
#>  rvest          0.3.5      2019-11-08 [1] CRAN (R 4.0.0)                    
#>  scales       * 1.1.1      2020-05-11 [1] CRAN (R 4.0.0)                    
#>  selectr        0.4-2      2019-11-20 [1] CRAN (R 4.0.0)                    
#>  sessioninfo    1.1.1      2018-11-05 [1] CRAN (R 4.0.0)                    
#>  stringi        1.4.6      2020-02-17 [1] CRAN (R 4.0.0)                    
#>  stringr      * 1.4.0      2019-02-10 [1] CRAN (R 4.0.0)                    
#>  survival       3.1-12     2020-04-10 [4] CRAN (R 4.0.0)                    
#>  testthat       2.3.2      2020-03-02 [1] CRAN (R 4.0.0)                    
#>  tibble       * 3.0.3      2020-07-10 [1] CRAN (R 4.0.0)                    
#>  tidymodels   * 0.1.1      2020-07-14 [1] CRAN (R 4.0.0)                    
#>  tidyr        * 1.1.0      2020-05-20 [1] CRAN (R 4.0.0)                    
#>  tidyselect     1.1.0      2020-05-11 [1] CRAN (R 4.0.0)                    
#>  tidytuesdayR   1.0.1      2020-07-10 [1] CRAN (R 4.0.0)                    
#>  tidyverse    * 1.3.0      2019-11-21 [1] CRAN (R 4.0.0)                    
#>  timeDate       3043.102   2018-02-21 [1] CRAN (R 4.0.0)                    
#>  tune         * 0.1.1      2020-07-08 [1] CRAN (R 4.0.0)                    
#>  usethis        1.6.1      2020-04-29 [1] CRAN (R 4.0.0)                    
#>  utf8           1.1.4      2018-05-24 [1] CRAN (R 4.0.0)                    
#>  vctrs          0.3.2      2020-07-15 [1] CRAN (R 4.0.0)                    
#>  withr          2.2.0      2020-04-20 [1] CRAN (R 4.0.0)                    
#>  workflows    * 0.1.2      2020-07-07 [1] CRAN (R 4.0.0)                    
#>  xfun           0.15       2020-06-21 [1] CRAN (R 4.0.0)                    
#>  xml2           1.3.2      2020-04-23 [1] CRAN (R 4.0.0)                    
#>  yaml           2.2.1      2020-02-01 [1] CRAN (R 4.0.0)                    
#>  yardstick    * 0.0.7      2020-07-13 [1] CRAN (R 4.0.0)                    
#> 
#> [1] /home/mdneuzerling/R/x86_64-pc-linux-gnu-library/4.0
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library
4

1 回答 1

7

此处出现错误是因为:在调优期间,配方执行到 step_string2factor() 时,会尝试处理那些没有任何角色的变量,例如 species 和 owner。

在挑选结果和预测变量之前,尝试为所有名义变量设置角色。

# Same recipe as in the question, with one extra update_role() call up front
coffee_recipe <- recipe(coffee_train) %>%
  # Give every nominal column a declared role first, so that none of them is
  # left without a role; the predictors among them are re-assigned by the
  # update_role() calls below.
  update_role(all_nominal(), new_role = "id") %>%      ## ADD THIS
  # The response to be modelled
  update_role(cupper_points, new_role = "outcome") %>%
  # The nine columns used as predictors
  update_role(
    variety, processing_method, country_of_origin,
    aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
    new_role = "predictor"
  ) %>%
  # Convert string columns to factors (type-based selector, outcome excluded)
  step_string2factor(all_nominal(), -all_outcomes()) %>%
  # K-nearest-neighbour imputation of two predictors, driven by the
  # `impute_with` columns
  step_knnimpute(
    country_of_origin, altitude_mean_meters,
    impute_with = imp_vars(
      in_country_partner, company, region, farm_name, certification_body
    )
  ) %>%
  # Assign an explicit "Unknown" factor level in these two columns
  step_unknown(variety, processing_method, new_level = "Unknown") %>%
  # Collapse infrequent factor levels, with a separate threshold per column
  step_other(country_of_origin, threshold = 0.01) %>%
  step_other(processing_method, threshold = 0.10) %>%
  step_other(variety, threshold = 0.10)

这样修改之后,代码基本可以正常运行,只是有少数几次对海拔(altitude)的插补失败。同时插补这两个变量可能比较困难。

于 2020-07-22T00:14:32.723 回答