r - 在 Tidymodels 中使用分类成本和自定义成本矩阵进行调整

Question

我正在使用 tidymodels 来构建一个模型，其中假阴性比假阳性更昂贵。因此，我想使用该yardstick::classification_cost指标进行超参数调整，但使用反映这一事实的自定义分类成本矩阵。

在拟合模型后执行此操作非常简单：

library(tidymodels)

# load simulated prediction output
data("two_class_example")

# cost matrix penalizing false negatives
cost_matrix <- tribble(
  ~truth, ~estimate, ~cost,
  "Class1", "Class2",  2,
  "Class2", "Class1",  1
)

# use function on simulated prediction output
classification_cost(
  data = two_class_example,
  truth = truth,
  # target class probability
  Class1, 
  # supply the function with the cost matrix
  costs = cost_matrix)
#> # A tibble: 1 × 3
#>   .metric             .estimator .estimate
#>   <chr>               <chr>          <dbl>
#> 1 classification_cost binary         0.260

^{由reprex 包于 2021-11-01 创建(v2.0.1)}

但是在超参数调整期间使用这个函数是我遇到问题的地方。文档指出，对于设置选项，指标应该包含在自定义函数中。这是我的尝试和由此产生的错误。请注意此包装器如何在评估拟合模型时正常工作，但在尝试用于调整时会引发错误：

library(tidymodels)

# load data
data("two_class_example")
data("two_class_dat")

# create custom metric penalizing false negatives 
classification_cost_penalized <- function(
  data,
  truth,
  class_proba,
  na_rm = TRUE
) {
  
  # cost matrix penalizing false negatives
  cost_matrix <- tribble(
    ~truth, ~estimate, ~cost,
    "Class1", "Class2",  2,
    "Class2", "Class1",  1
  )
  
  classification_cost(
    data = data,
    truth = !! rlang::enquo(truth),
    # supply the function with the class probabilities
    !! rlang::enquo(class_proba), 
    # supply the function with the cost matrix
    costs = cost_matrix,
    na_rm = na_rm
  )
}

# Use `new_numeric_metric()` to formalize this new metric function
classification_cost_penalized <- new_prob_metric(classification_cost_penalized, "minimize")

# test if this works on the simulated estimates
two_class_example %>% 
  classification_cost_penalized(truth = truth, class_prob = Class1)
#> # A tibble: 1 × 3
#>   .metric             .estimator .estimate
#>   <chr>               <chr>          <dbl>
#> 1 classification_cost binary         0.260

# test if this works with hyperparameter tuning

# specify a RF model
my_model <- 
  rand_forest(mtry = tune(), 
              min_n = tune(),
              trees = 500) %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

# specify recipe
my_recipe <- recipe(Class ~ A + B, data = two_class_dat)

# bundle to workflow
my_wf <- workflow() %>% 
  add_model(my_model) %>% 
  add_recipe(my_recipe)

# start tuning
tuned_rf <- my_wf %>% 
  # set up tuning grid
  tune_grid(
    resamples = vfold_cv(two_class_dat, 
                         v = 5),
    grid = 5,
    metrics = metric_set(classification_cost_penalized))
#> i Creating pre-processing data to finalize unknown parameter: mtry
#> x Fold1: internal: Error: In metric: `classification_cost_penalized`
#> unused argum...
#> x Fold2: internal: Error: In metric: `classification_cost_penalized`
#> unused argum...
#> x Fold3: internal: Error: In metric: `classification_cost_penalized`
#> unused argum...
#> x Fold4: internal: Error: In metric: `classification_cost_penalized`
#> unused argum...
#> x Fold5: internal: Error: In metric: `classification_cost_penalized`
#> unused argum...
#> Warning: All models failed. See the `.notes` column.

^{由reprex 包于 2021-11-01 创建(v2.0.1)}

取消嵌套注释显示有未使用的参数："internal: Error: In metric: classification_cost_penalized\nunused arguments (estimator = ~prob_estimator, event_level = ~event_level)"但显然该yardstick_event_level()函数，event_level根据本文档应该如何设置，不存在？搜索时没有显示该名称下的功能。

我不知道如何在这里进行。感谢您的时间。

score 3 · Accepted Answer

当您调整现有的标准度量时，使用该metric_tweak()函数要容易得多，它允许您对某些可选参数（如cost）进行硬编码，同时保持其他一切不变。它有点像purrr::partial()，但用于衡量标准。

library(tidymodels)

# load data
data("two_class_example")
data("two_class_dat")

cost_matrix <- tribble(
  ~truth, ~estimate, ~cost,
  "Class1", "Class2",  2,
  "Class2", "Class1",  1
)

classification_cost_penalized <- metric_tweak(
  .name = "classification_cost_penalized",
  .fn = classification_cost,
  costs = cost_matrix
)

# test if this works on the simulated estimates
two_class_example %>% 
  classification_cost_penalized(truth = truth, class_prob = Class1)
#> # A tibble: 1 × 3
#>   .metric                       .estimator .estimate
#>   <chr>                         <chr>          <dbl>
#> 1 classification_cost_penalized binary         0.260

# specify a RF model
my_model <- 
  rand_forest(
    mtry = tune(), 
    min_n = tune(),
    trees = 500
  ) %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

# specify recipe
my_recipe <- recipe(Class ~ A + B, data = two_class_dat)

# bundle to workflow
my_wf <- workflow() %>% 
  add_model(my_model) %>% 
  add_recipe(my_recipe)

# start tuning
tuned_rf <- my_wf %>% 
  tune_grid(
    resamples = vfold_cv(two_class_dat, v = 5),
    grid = 5,
    metrics = metric_set(classification_cost_penalized)
  )
#> i Creating pre-processing data to finalize unknown parameter: mtry

collect_metrics(tuned_rf)
#> # A tibble: 5 × 8
#>    mtry min_n .metric              .estimator  mean     n std_err .config       
#>   <int> <int> <chr>                <chr>      <dbl> <int>   <dbl> <chr>         
#> 1     1    35 classification_cost… binary     0.407     5  0.0162 Preprocessor1…
#> 2     1    23 classification_cost… binary     0.403     5  0.0146 Preprocessor1…
#> 3     1    10 classification_cost… binary     0.403     5  0.0137 Preprocessor1…
#> 4     2    27 classification_cost… binary     0.396     5  0.0166 Preprocessor1…
#> 5     2     6 classification_cost… binary     0.401     5  0.0161 Preprocessor1…

^{由reprex 包于 2021-11-03 创建(v2.0.1)}

score 0 · Accepted Answer

对于成本不同的情况，您是否会推荐其他指标或方法？即使提供了假阳性和阴性之间的极端差异，它似乎也没有为评估什么是最好的做很多事情。基于上述代码的示例：

library(tidymodels)

# load data
data("two_class_example")
data("two_class_dat")

cost_matrix_1 <- tribble(
  ~truth, ~estimate, ~cost,
  "Class1", "Class2",  10,
  "Class2", "Class1",  1
)

cost_matrix_2 <- tribble(
  ~truth, ~estimate, ~cost,
  "Class1", "Class2",  1,
  "Class2", "Class1",  10
)

classification_cost_penalized_1 <- metric_tweak(
  .name = "classification_cost_penalized_1",
  .fn = classification_cost,
  costs = cost_matrix_1
)

classification_cost_penalized_2 <- metric_tweak(
  .name = "classification_cost_penalized_2",
  .fn = classification_cost,
  costs = cost_matrix_2
)

# test if this works on the simulated estimates
two_class_example %>% 
  classification_cost_penalized_1(truth = truth, class_prob = Class1)
#> # A tibble: 1 × 3
#>   .metric                       .estimator .estimate
#>   <chr>                         <chr>          <dbl>
#> 1 classification_cost_penalized binary         0.260

two_class_example %>% 
  classification_cost_penalized_2(truth = truth, class_prob = Class1)

# specify a RF model
my_model <- 
  rand_forest(
    mtry = tune(), 
    min_n = tune(),
    trees = 500
  ) %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

# specify recipe
my_recipe <- recipe(Class ~ A + B, data = two_class_dat)

# bundle to workflow
my_wf <- workflow() %>% 
  add_model(my_model) %>% 
  add_recipe(my_recipe)

# start tuning
tuned_rf <- my_wf %>% 
  tune_grid(
    resamples = vfold_cv(two_class_dat, v = 5),
    grid = 50,
    metrics = metric_set(classification_cost_penalized_1,classification_cost_penalized_2)
  )
#> i Creating pre-processing data to finalize unknown parameter: mtry

#seems to always be increasing or decreaing and not much differrence even when large differences
collect_metrics(tuned_rf) %>%
  ggplot(aes(x = min_n, y = mean, color = .metric)) +
  geom_line() +
  facet_grid(rows = "mtry")

指标概览的可视化

r - 在 Tidymodels 中使用分类成本和自定义成本矩阵进行调整

2 回答 2

Related

Reference