1

运行 tidymodels/stacks 后是否可以检索一个、多个或完整堆叠模型的变量重要性?VIP 包尚不支持此功能,但是否有其他方法可以提取该信息?

下面的代码大部分取自 Simon Couch 的博客,是我目前尝试的大致做法;不同的是,我改用随机森林和 SVM 来尝试检索变量重要性。

library(tidyverse)
library(tidymodels)
library(stacks)
library(vip)

# Download the TidyTuesday (2020-10-27) wind-turbine data.
wind_raw <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv"
)

# Build the modelling table:
#   * keep six columns and give two of them shorter names,
#   * coerce `year` to numeric (values that cannot be coerced become NA),
#   * relabel any province/territory with fewer than 50 rows as "Other",
#   * drop rows whose `year` or outcome (`turbine_rated_capacity_kw`) is NA.
wind <-
  wind_raw %>%
  dplyr::select(
    province_territory,
    total_project_capacity_mw,
    turbine_rated_capacity_k_w,
    rotor_diameter_m,
    hub_height_m,
    commissioning_date
  ) %>%
  rename(
    turbine_rated_capacity_kw = turbine_rated_capacity_k_w,
    year = commissioning_date
  ) %>%
  group_by(province_territory) %>%
  mutate(
    year = as.numeric(year),
    # n() is the size of the current province group.
    province_territory = case_when(
      n() >= 50 ~ province_territory,
      TRUE ~ "Other"
    )
  ) %>%
  ungroup() %>%
  filter(
    !is.na(year),
    !is.na(turbine_rated_capacity_kw)
  )

# Partition the data into training and testing sets (seeded so the split is
# reproducible; initial_split() is called with its default settings).
set.seed(1)
wind_split <- initial_split(wind)
wind_train <- wind_split %>% training()
wind_test <- wind_split %>% testing()

# Build 5-fold cross-validation resamples from the training set.
set.seed(1)
folds <- wind_train %>% rsample::vfold_cv(v = 5)

# Preprocessing recipe shared by every candidate model:
#   1. impute missing predictor values with k-nearest neighbours,
#   2. dummy-encode the nominal predictors,
#   3. remove zero-variance predictors.
# `all_nominal_predictors()` is used instead of `all_nominal()` so that the
# selection is restricted to predictor-role columns and can never pick up the
# outcome, even if the outcome were nominal.
wind_rec <-
  recipe(turbine_rated_capacity_kw ~ ., data = wind_train) %>%
  step_impute_knn(all_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Minimal workflow holding only the recipe; each model spec is added to a
# copy of this workflow later on.
wind_wflow <-
  workflow() %>%
  add_recipe(wind_rec)

# Control objects for building stack candidates.
# `ctrl_grid` is the one passed to the tune_grid() calls in this script; it
# was referenced but never defined, which made those calls fail with
# "object 'ctrl_grid' not found". control_stack_grid() is the stacks helper
# intended for tune_grid() results that will be passed to add_candidates().
ctrl_grid <- control_stack_grid()
# Counterpart for fit_resamples(); kept for any code that relies on it.
ctrl_res <- control_stack_resamples()

# Random-forest regression spec: 1000 trees, with `mtry` and `min_n` left as
# tuning placeholders. The ranger engine is asked for impurity-based
# importance so that importance scores are stored with the fitted forest.
rf_spec <-
  rand_forest(trees = 1000, mtry = tune(), min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine(engine = "ranger", importance = "impurity")

# Attach the random-forest spec to the shared recipe workflow.
rf_wflow <- add_model(wind_wflow, rf_spec)

# Tune the random forest's `mtry` and `min_n` over a grid of 5 candidate
# parameter sets, evaluated on the 5-fold CV resamples.
# `ctrl_grid` is expected to be a tune control object already defined in the
# session.
set.seed(1)
rf_res <-
  rf_wflow %>%
  tune_grid(resamples = folds, grid = 5, control = ctrl_grid)

# Radial-basis-function SVM regression spec (kernlab engine) with `cost` and
# `rbf_sigma` left as tuning placeholders.
svm_spec <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine(engine = "kernlab") %>%
  set_mode(mode = "regression")

# Attach the SVM spec to the shared recipe workflow.
svm_wflow <- add_model(wind_wflow, svm_spec)

# Tune the SVM's `cost` and `rbf_sigma` over a grid of 5 candidate parameter
# sets, evaluated on the 5-fold CV resamples.
# `ctrl_grid` is expected to be a tune control object already defined in the
# session.
set.seed(1)
svm_res <-
  svm_wflow %>%
  tune_grid(resamples = folds, grid = 5, control = ctrl_grid)

# Build the model stack step by step:
# start from an empty data stack, register both sets of tuning results as
# candidates, blend their predictions, then fit the retained members.
wind_data_st <- stacks()
wind_data_st <- add_candidates(wind_data_st, rf_res)
wind_data_st <- add_candidates(wind_data_st, svm_res)
wind_data_st <- blend_predictions(wind_data_st)
wind_data_st <- fit_members(wind_data_st)

# Attempt to plot variable importance for the fitted stack as a whole.
# This call is the one that fails with the "Model-specific variable
# importance scores are currently not available for this type of model"
# error discussed in the text that follows.
vip(wind_data_st)

运行后得到的报错是 `Error: Model-specific variable importance scores are currently not available for this type of model.`,其含义不言自明。但是否有办法提取这些信息?也许可以借助 vip 包之外的工具?是否可以从堆叠中挑出某个入选的成员模型单独进行评估?有谁知道 vip 是否计划支持这一功能?提前致谢!

4

0 回答 0