3

目标

  1. 使用 MLR3 创建 LASSO 模型
  2. 使用带有内部 CV 或引导程序的嵌套 CV 进行超参数 (lambda) 确定,使用外部 CV 进行模型性能评估(而不是只进行一次训练/测试拆分),并在不同模型实例中找到不同 LASSO 回归系数的标准偏差。
  3. 对尚不可用的测试数据集进行预测。

问题

  1. 我不确定在下面的代码中是否正确实现了所描述的嵌套 CV 方法。
  2. 我不确定 alpha 是否设置正确 alpha = 1 only。
  3. 在 mlr3 中使用重采样时,我不知道如何访问 LASSO 的 lambda 系数。(mlr3learners 中的 importance() 尚不支持 LASSO)
  4. 我不知道如何将可能的模型应用于 mlr3 中不可用的测试集。

代码

# Load required packages
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)

# Data ------
# Prepared according to the blog post by Julia Silge:
# https://juliasilge.com/blog/lasso-the-office/
urlfile <- "https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv"
data <- read_csv(url(urlfile))[-1] # drop the unnamed index column
#> Warning: Missing column names filled in: 'X1' [1]
#> Parsed with column specification:
#> cols(
#>   .default = col_double()
#> )
#> See spec(...) for full column specifications.

# Add a synthetic factor column to the data
data$factor <- as.factor(
  c(
    rep("a", 20), rep("b", 50), rep("c", 30),
    rep("a", 6), rep("c", 10), rep("b", 20)
  )
)

# Task creation: regression task with imdb_rating as the target
task <- TaskRegr$new(
  id = "office",
  backend = data,
  target = "imdb_rating"
)

# Model creation: preprocessing pipeline feeding a cross-validated LASSO.
# cv_glmnet runs its own 10-fold inner CV over the lambda path;
# alpha = 1 requests a pure LASSO penalty (no ridge component).
graph <-
  po("scale") %>>%  # standardize numeric predictors
  po("encode") %>>% # one-hot encode the factor column
  # Open question: how to normalize predictors while leaving the target unchanged?
  lrn("regr.cv_glmnet", id = "rp", alpha = 1, family = "gaussian")

graph_learner <- GraphLearner$new(graph)

# Execution (actual modeling): 5-fold outer CV estimates performance,
# while the learner's internal CV tunes lambda within each outer
# training fold — this is the nested-CV structure.
result <- resample(
  task,
  graph_learner,
  rsmp("cv", folds = 5) # outer CV
)
#> INFO  [13:21:53.485] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 3/5) 
#> INFO  [13:21:54.937] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 2/5) 
#> INFO  [13:21:55.242] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 1/5) 
#> INFO  [13:21:55.500] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 4/5) 
#> INFO  [13:21:55.831] Applying learner 'scale.encode.regr.cv_glmnet' on task 'office' (iter 5/5)

# Open question: how to access results, i.e. lambda coefficients,
# and compare them (why no variable importance for glmnet)?

# Access predictions pooled over the outer folds
result$prediction()
#> <PredictionRegr> for 136 observations:
#>     row_id truth response
#>          2   8.3 8.373798
#>          6   8.7 8.455151
#>          9   8.4 8.358964
#> ---                      
#>        116   9.7 8.457607
#>        119   8.2 8.130352
#>        128   7.8 8.224150

reprex 包(v0.3.0)于 2020 年 6 月 11 日创建

编辑 1(LASSO 系数)

根据 missuse 关于 LASSO 系数的评论,可以通过 result$data$learner[[1]]$model$rp$model$glmnet.fit$beta 访问系数。另外,我发现需要在 resample 中设置 store_models = TRUE 以存储模型,进而访问系数。

  • 尽管设置了 alpha = 1,我还是得到了多个 LASSO 系数。我想要“最佳”LASSO 系数(例如源自 lambda = lambda.min 或 lambda.1se 的系数)。不同的 s1, s2, s3, ... 是什么意思?这些是不同的 lambda 吗?
  • 不同的系数确实似乎源于不同的 lambda 值,表示为 s1, s2, s3, ...(数字是索引)。我想,可以通过首先找到“最佳”lambda 的索引来访问“最佳”系数:index_lamda.1se = which(ft$lambda == ft$lambda.1se)[[1]]; index_lamda.min = which(ft$lambda == ft$lambda.min)[[1]],然后再找到对应的系数集。missuse 的评论中给出了一种更简洁的方法来找到“最佳”系数。
# Load required packages
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)

# Data preparation (see Julia Silge's blog post)
urlfile <- "https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv"
data <- read_csv(url(urlfile))[-1] # drop the unnamed index column

# Add a synthetic factor column to the data
data$factor <- as.factor(
  c(
    rep("a", 20), rep("b", 50), rep("c", 30),
    rep("a", 6), rep("c", 10), rep("b", 20)
  )
)

# Task creation: regression task with imdb_rating as the target
task <- TaskRegr$new(
  id = "office",
  backend = data,
  target = "imdb_rating"
)

# Model creation: scaling + encoding feeding a cross-validated LASSO.
# cv_glmnet runs its own 10-fold inner CV over the lambda path;
# alpha = 1 requests a pure LASSO penalty.
graph <-
  po("scale") %>>%  # standardize numeric predictors
  po("encode") %>>% # one-hot encode the factor column
  # Open question: how to normalize predictors while leaving the target unchanged?
  lrn("regr.cv_glmnet", id = "rp", alpha = 1, family = "gaussian")

# Retain the intermediate results of each pipeline step
graph$keep_results <- TRUE
graph_learner <- GraphLearner$new(graph)

# Execution (actual modeling): 5-fold outer CV for performance estimation.
# store_models = TRUE keeps every fitted model so that the per-fold
# LASSO coefficients can be inspected afterwards.
result <- resample(
  task,
  graph_learner,
  rsmp("cv", folds = 5), # outer CV
  store_models = TRUE    # needed to access coefficients later
)

# LASSO coefficients 
# NOTE(review): each column s0, s1, s2, ... of the beta matrix presumably
# corresponds to one lambda value along the glmnet regularization path
# (alpha stays fixed at 1); the coefficients for the CV-selected lambda
# (lambda.min or lambda.1se) would then be a single column of this matrix —
# confirm against the glmnet documentation.
# Why more than one coefficient per predictor? 
# What are s1, s2 etc.? Shouldn't 'lrn' fix alpha = 1?
# How to obtain the best coefficient (for lamda 1se or min) if multiple?
as.matrix(result$data$learner[[1]]$model$rp$model$glmnet.fit$beta)
#>                  s0          s1         s2         s3          s4         s5
#> andy              0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> angela            0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> b_j_novak         0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> brent_forrester   0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> darryl            0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> dwight            0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> episode           0 0.000000000 0.00000000 0.00000000 0.010297763 0.02170423
#> erin              0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> gene_stupnitsky   0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> greg_daniels      0 0.000000000 0.00000000 0.00000000 0.001845101 0.01309437
#> jan               0 0.000000000 0.00000000 0.00000000 0.005663699 0.01357832
#> jeffrey_blitz     0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> jennifer_celotta  0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> jim               0 0.006331732 0.01761548 0.02789682 0.036853510 0.04590513
#> justin_spitzer    0 0.000000000 0.00000000 0.00000000 0.000000000 0.00000000
#> [...]
#>                           s6           s7           s8          s9         s10
#> andy             0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> angela           0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> b_j_novak        0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> brent_forrester  0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> darryl           0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> dwight           0.002554576  0.007006995  0.011336058  0.01526851  0.01887180
#> episode          0.031963475  0.040864492  0.047487987  0.05356482  0.05910066
#> erin             0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> gene_stupnitsky  0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> greg_daniels     0.023040791  0.031866343  0.040170917  0.04779004  0.05472702
#> jan              0.021030152  0.028094541  0.035062678  0.04143812  0.04725379
#> jeffrey_blitz    0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> jennifer_celotta 0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> jim              0.053013058  0.058503984  0.062897112  0.06683734  0.07041964
#> justin_spitzer   0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> kelly            0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> ken_kwapis       0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> kevin            0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> lee_eisenberg    0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> michael          0.057190859  0.062963830  0.068766981  0.07394472  0.07865977
#> mindy_kaling     0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> oscar            0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> pam              0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> paul_feig        0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> paul_lieberstein 0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> phyllis          0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> randall_einhorn  0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> ryan             0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> season           0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> toby             0.000000000  0.000000000  0.005637169  0.01202893  0.01785309
#> factor.a         0.000000000 -0.003390125 -0.022365768 -0.03947047 -0.05505681
#> factor.b         0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#> factor.c         0.000000000  0.000000000  0.000000000  0.00000000  0.00000000
#>                           s11          s12          s13           s14
#> andy              0.000000000  0.000000000  0.000000000  0.0000000000
#> angela            0.000000000  0.000000000  0.000000000  0.0000000000
#> b_j_novak         0.000000000  0.000000000  0.000000000  0.0000000000
#> brent_forrester   0.000000000  0.000000000  0.000000000  0.0000000000
#> darryl            0.000000000  0.000000000  0.000000000  0.0017042281
#> dwight            0.022170870  0.025326337  0.027880703  0.0303865693
#> episode           0.064126846  0.069018240  0.074399623  0.0794693480
#> [...]

reprex 包(v0.3.0)于 2020 年 6 月 15 日创建

编辑 2(可选的后续问题)

嵌套 CV 提供了多个模型之间的差异评估。差异可以表示为外部 CV 获得的误差(例如 RMSE)。虽然该误差可能很小,但来自模型(由外部 CV 实例化)的单个 LASSO 系数(预测变量的重要性)可能会有很大差异。

  1. mlr3 是否提供描述预测变量定量重要性一致性的功能,即外部 CV 创建的模型中 LASSO 系数的 RMSE?还是应该创建一个自定义函数,使用 result$data$learner[[i]]$model$rp$model$glmnet.fit$beta(由 missuse 建议)检索 LASSO 系数(其中 i = 1、2、3、4、5 是外部 CV 的折叠),然后取匹配系数的 RMSE?
4

0 回答 0