r - 在 R Tidymodels 中：将单独的 pca（预处理/食谱）应用于不同的数据帧，然后将它们合并以进行训练

Question

我想使用 tidymodels 框架，其中三个不同的数据帧分别使用 pca 进行预处理；然后我希望将三个数据帧中的 PCA 组件放在一起进行一次培训。对于训练，我想使用岭回归应用交叉验证。更重要的是避免 PCA 的数据泄露；因此 PCA 应该在交叉验证过程中实施，而不是之前单独执行。（因此，下面示例代码中的 df_x1、df_x2、df_x3 应该创建它们自己的 PCA 组件，其中这些组件应该是一个最终岭回归预测 y 的预测器。

这是一些示例数据：

set.seed(42)
df_x1 <- tibble(V1=runif(10, 1, 10), V2=runif(10, 1, 10))
df_x2 <- tibble(V3=runif(10, 1, 10), V4=runif(10, 1, 10))
df_x3 <- tibble(V5=runif(10, 1, 10), V6=runif(10, 1, 10))

y <- runif(10, 1, 10)

df_x1 <- cbind(df_x1, y)
df_x2 <- cbind(df_x2, y)
df_x3 <- cbind(df_x3, y)

这是我到目前为止所尝试的。

library(recipes)
library(workflows)
library(parsnip)
library(tune)


df_x1_recipe <-
  recipe(y ~ .,
         data = df_x1) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors(), threshold = .95)

df_x2_recipe <-
  recipe(y ~ .,
                  data = df_x2) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors(), threshold = .95)

df_x3_recipe <-
  recipe(y ~ .,
                  data = df_x3) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors(), threshold = .95)


# Cross-validation: In order to avoid data leakage I reckon the three recpies above should have the same folds.
cv_splits <- vfold_cv(df_x3, v = 2, repeats = 1, strata = NULL) # , ... ,  breaks = 4
cv_splits

# Model
dfx1_x3_model <-
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

# Tuning
dfx1_x3_grid <- expand.grid(
  penalty = 10 ^ seq(-3, -1, length = 5),
  mixture = (0:4) / 4
)

control <- control_grid(save_pred = TRUE)


dfx1_x3_tune <- tune_grid(
  c(df_x1_recipe, df_x2_recipe, df_x3_recipe),
  model = df3_model,
  resamples = df3_cv_splits,
  grid = df3_glmn_grid,
  control = ctrl
)

score 1 · Accepted Answer

我不是 100% 确定以下代码可以回答您的问题，但它应该与您描述的过程相对应。我添加了一些应该解释代码的注释。

# packages
library(dplyr)
library(purrr)
library(recipes)
library(parsnip)
library(tune)
library(rsample)

# data
set.seed(42)
df_x1 <- tibble(V1=runif(10, 1, 10), V2=runif(10, 1, 10))
df_x2 <- tibble(V3=runif(10, 1, 10), V4=runif(10, 1, 10))
df_x3 <- tibble(V5=runif(10, 1, 10), V6=runif(10, 1, 10))

y <- runif(10, 1, 10)

# run PCA
my_PCA <- function(data) {
  pca_repice <- recipe(~ ., data = {{data}}) %>% 
    step_center(all_predictors()) %>% 
    step_scale(all_predictors()) %>% 
    step_pca(all_predictors(), threshold = 0.95)

  extract_PC <- juice(prep(pca_repice))
  extract_PC
}

cbind_PCA <- map_dfc(list(df_x1, df_x2, df_x3), my_PCA)
cbind_PCA$y <- y

这是结果 cbinding 应用my_PCA()到df_x1, ...,的结果df_x3

cbind_PCA
#> # A tibble: 10 x 7
#>       PC1    PC2    PC11    PC21   PC12   PC22     y
#>     <dbl>  <dbl>   <dbl>   <dbl>  <dbl>  <dbl> <dbl>
#>  1 -0.442  1.07  -1.04    0.143  -0.834  0.301  7.08
#>  2 -1.13   0.510  0.378  -1.58   -0.671  0.213  9.85
#>  3  0.130 -1.77  -0.497   1.04   -1.42   1.25   7.84
#>  4  0.270  1.33  -1.02    0.339   1.77   0.250  6.10
#>  5  0.290  0.319  2.16   -0.0329 -1.55  -0.649  8.65
#>  6 -0.516 -1.15  -0.439  -0.854   1.64   0.184  2.71
#>  7 -1.20  -0.653  1.52    0.592   1.28   0.138  3.44
#>  8  2.49  -0.237  0.0456  1.24   -0.711 -0.740  8.45
#>  9  0.218  0.331 -0.452  -1.14    0.284 -1.23   7.24
#> 10 -0.116  0.258 -0.644   0.262   0.214  0.278  3.16

现在我只是按照您在问题中定义的代码

# define the new recipe
my_recipe <- recipe(y ~ ., data = cbind_PCA)

# define the model
my_model <- linear_reg(penalty = tune(), mixture = tune()) %>% 
  set_engine("glmnet")

# define the tuning grid
my_grid <- expand.grid(
  penalty = 10 ^ seq(-3, -1, length = 5),
  mixture = (0:4) / 4
)

# define the CV splits
my_cv_splits <- vfold_cv(cbind_PCA, v = 2, repeats = 1)

这就是结果

# train the model
my_result <- tune_grid(
  object = my_recipe, 
  model = my_model, 
  resamples = my_cv_splits, 
  grid = my_grid, 
  control = control_grid(save_pred = TRUE)
)

# view results
collect_predictions(my_result)
#> # A tibble: 250 x 6
#>    id    .pred  .row penalty mixture     y
#>    <chr> <dbl> <int>   <dbl>   <dbl> <dbl>
#>  1 Fold1  6.21     2 0.001         0  9.85
#>  2 Fold1  6.21     2 0.00316       0  9.85
#>  3 Fold1  6.21     2 0.01          0  9.85
#>  4 Fold1  6.21     2 0.0316        0  9.85
#>  5 Fold1  6.63     4 0.001         0  6.10
#>  6 Fold1  6.63     4 0.00316       0  6.10
#>  7 Fold1  6.63     4 0.01          0  6.10
#>  8 Fold1  6.63     4 0.0316        0  6.10
#>  9 Fold1  6.15     7 0.001         0  3.44
#> 10 Fold1  6.15     7 0.00316       0  3.44
#> # ... with 240 more rows
collect_metrics(my_result)
#> # A tibble: 50 x 7
#>    penalty mixture .metric .estimator   mean     n std_err
#>      <dbl>   <dbl> <chr>   <chr>       <dbl> <int>   <dbl>
#>  1   0.001    0    rmse    standard   2.23       2  0.342 
#>  2   0.001    0    rsq     standard   0.182      2  0.0792
#>  3   0.001    0.25 rmse    standard   6.72       2  2.40  
#>  4   0.001    0.25 rsq     standard   0.0916     2  0.0626
#>  5   0.001    0.5  rmse    standard   7.16       2  3.07  
#>  6   0.001    0.5  rsq     standard   0.103      2  0.0830
#>  7   0.001    0.75 rmse    standard   7.24       2  3.15  
#>  8   0.001    0.75 rsq     standard   0.104      2  0.0825
#>  9   0.001    1    rmse    standard   7.27       2  3.19  
#> 10   0.001    1    rsq     standard   0.104      2  0.0824
#> # ... with 40 more rows

^{由reprex 包（v0.3.0）于 2020 年 3 月 22 日创建}

编辑：修改配方以避免数据泄漏

我修改了配方的定义，将所有 PCA 步骤合并为一个配方，现在它应该避免您提到的数据泄漏。

# packages
library(dplyr)
library(recipes)
library(parsnip)
library(tune)
library(rsample)

# data
set.seed(42)
df_x1 <- tibble(V1=runif(10, 1, 10), V2=runif(10, 1, 10))
df_x2 <- tibble(V3=runif(10, 1, 10), V4=runif(10, 1, 10))
df_x3 <- tibble(V5=runif(10, 1, 10), V6=runif(10, 1, 10))

y <- runif(10, 1, 10)
my_data <- cbind(y, df_x1, df_x2, df_x3)

# define the recipe
my_recipe <- recipe(y ~ ., data = my_data) %>% 
  step_center(all_predictors()) %>% 
  step_scale(all_predictors()) %>% 
  step_pca(V1, V2, threshold = 0.95, prefix = "group1_") %>% 
  step_pca(V3, V4, threshold = 0.95, prefix = "group2_") %>% 
  step_pca(V5, V6, threshold = 0.95, prefix = "group3_")

其余代码或多或少与以前相同。

# define the model
my_model <- linear_reg(penalty = tune(), mixture = tune()) %>% 
  set_engine("glmnet")

# define the tuning grid
my_grid <- expand.grid(
  penalty = 10 ^ seq(-3, -1, length = 5),
  mixture = (0:4) / 4
)

# define the CV splits
my_cv_splits <- vfold_cv(my_data, v = 2, repeats = 1)

# train the model
my_result <- tune_grid(
  object = my_recipe, 
  model = my_model, 
  resamples = my_cv_splits, 
  grid = my_grid, 
  control = control_grid(save_pred = TRUE)
)

^{由reprex 包（v0.3.0）于 2020-03-25 创建}

我对tidymodels和该环境中的所有软件包并不十分熟悉，所以这可能不是理想的解决方案，但我认为该过程确实有意义。

r - 在 R Tidymodels 中：将单独的 pca（预处理/食谱）应用于不同的数据帧，然后将它们合并以进行训练

1 回答 1

编辑：修改配方以避免数据泄漏

Related

Reference