0

我有一个包含连续变量和分类变量的数据集。我正在运行回归以根据数据集中的其他变量预测其中一个变量。在比较 ridge、lasso 和 elastic-net 回归的结果后,lasso 回归是最好的模型。

我使用 `coef` 函数来提取模型的系数,但结果是一个非常长的列表,包含 800 多个变量(因为我的一些分类变量有很多级别)。有没有一种方法可以快速将系数从大到小排序?这是 glmnet 模型的输出。

可重现该问题的示例代码:

# Libraries Needed
library(caret)
library(glmnet)
library(mlbench)
library(psych)

# Data
# Load the BostonHousing data set and inspect its structure.
data("BostonHousing")
data <- BostonHousing
str(data)

# Data Partition
# Assign every row to group 1 or 2 with probabilities 0.7 / 0.3; group 1
# becomes the training set and group 2 the test set. The seed fixes the draw.
set.seed(222)
# TRUE rather than T: T is an ordinary variable that can be reassigned.
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind == 1, ]
test <- data[ind == 2, ]

# Custom Control Parameters
# Resampling scheme passed as trControl to the train() calls: 10-fold
# cross-validation repeated 5 times, with per-iteration progress output.
custom <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  verboseIter = TRUE # TRUE rather than the reassignable shorthand T
)

# Linear Model
# Fit medv against all other columns of the training set with method "lm",
# resampled according to the custom control object.
set.seed(1234)
lm <- train(
  medv ~ .,
  data = train,
  method = "lm",
  trControl = custom
)

# Results
lm$results          # resampling results table
lm                  # print the caret fit
summary(lm)         # summary of the fit
plot(lm$finalModel) # plots of the final model object

# Ridge Regression
# glmnet with alpha fixed at 0; lambda is tried at 5 evenly spaced values
# between 0.0001 and 1.
set.seed(1234)
ridge <- train(
  medv ~ .,
  data = train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 0,
    # length.out spelled out instead of relying on partial matching
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = custom
)
# Increasing lambda increases the penalty (and vice versa), so a larger
# lambda causes the coefficients to shrink.

# Plot Results
plot(ridge)
plot(ridge$finalModel, xvar = "lambda", label = TRUE)
plot(ridge$finalModel, xvar = "dev", label = TRUE)
plot(varImp(ridge, scale = TRUE))

# Lasso Regression
# glmnet with alpha fixed at 1; lambda is tried at 5 evenly spaced values
# between 0.0001 and 1.
set.seed(1234)
lasso <- train(
  medv ~ .,
  data = train,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 1,
    # length.out spelled out instead of relying on partial matching
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = custom
)

# Plot Results
plot(lasso)
lasso
plot(lasso$finalModel, xvar = "lambda", label = TRUE)
plot(lasso$finalModel, xvar = "dev", label = TRUE)
plot(varImp(lasso, scale = TRUE))

# Elastic Net Regression
# Tune both glmnet parameters: alpha over 10 evenly spaced values in [0, 1]
# and lambda over 5 evenly spaced values in [0.0001, 1]; expand.grid() builds
# every combination of the two.
set.seed(1234)
en <- train(
  medv ~ .,
  data = train,
  method = "glmnet",
  tuneGrid = expand.grid(
    # length.out spelled out instead of relying on partial matching
    alpha = seq(0, 1, length.out = 10),
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = custom
)

# Plot Results
plot(en)
plot(en$finalModel, xvar = "lambda", label = TRUE)
plot(en$finalModel, xvar = "dev", label = TRUE)
plot(varImp(en))

# Compare Models
# Collect the four fits, gather their resampling results and compare them
# numerically and graphically.
model_list <- list(
  LinearModel = lm,
  Ridge = ridge,
  Lasso = lasso,
  ElasticNet = en
)
res <- resamples(model_list)
summary(res)
bwplot(res)
xyplot(res, metric = "RMSE")

# Best Model
# NOTE(review): the accompanying text names lasso as the best model, while the
# lines below read the tuning result and final model from `en` -- confirm
# which fit is intended.
en$bestTune
best <- en$finalModel
# Coefficients at the selected lambda.
coef(best, s = en$bestTune$lambda)
4

1 回答 1

0

对于大多数模型,您所要做的就是:

sort(coef(model), decreasing=TRUE)

因为您使用的是 glmnet,情况会稍微复杂一些。我将在此处复制您示例的最小版本(要重现您的问题,其他模型、绘图等都不是必需的……)

## Packages
library(caret)
library(glmnet)
library(mlbench) ## for BostonHousing data
# Data
# Load BostonHousing and keep a working copy.
data("BostonHousing")
data <- BostonHousing

# Data Partition
# Draw a group label (1 or 2, probabilities 0.7 / 0.3) for every row, then
# split the rows into training (group 1) and test (group 2) sets.
set.seed(222)
ind <- sample(
  2,
  nrow(data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
train <- data[ind == 1, ]
test <- data[ind == 2, ]
# Custom Control Parameters
# Repeated cross-validation: 10 folds, 5 repeats, with progress output for
# each iteration.
custom <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  verboseIter = TRUE
)
# Elastic Net Regression
# Tune alpha over 10 evenly spaced values in [0, 1] and lambda over 5 evenly
# spaced values in [0.0001, 1]; expand.grid() builds every combination.
set.seed(1234)
en <- train(
  medv ~ .,
  data = train,
  method = "glmnet",
  tuneGrid = expand.grid(
    # length.out spelled out instead of relying on partial matching
    alpha = seq(0, 1, length.out = 10),
    lambda = seq(0.0001, 1, length.out = 5)
  ),
  trControl = custom
)

# Best Model
# Take the final model from the caret fit and extract its coefficients at the
# selected lambda.
best <- en$finalModel
coefs <- coef(best, s = en$bestTune$lambda)

(这还可以更简单:例如,您真的需要自定义控制参数才能向我们展示这个示例吗?如果不使用 `caret`,只使用 `glmnet`,会更简单,但我担心这样可能会遗漏一些东西。)

一旦你得到了系数,排序似乎确实有效,尽管有一条关于可能效率低下的消息:

# Sort the extracted coefficients from largest to smallest. The lines that
# follow are the recorded console output of this call: an informational
# message, then the sorted values printed without coefficient names.
sort(coefs, decreasing=TRUE)
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
##  [1]  25.191049410   5.078589706   1.389548822   0.244605193   0.045600250
##  [6]   0.008840485   0.004372752  -0.012701593  -0.028337745  -0.162794401
## [11]  -0.335062819  -0.901475516  -1.395091095 -12.632336419

`sort(as.numeric(coefs))` 似乎也可以正常工作。

如果您想对整个矩阵进行排序(即保留所有惩罚级别的值),您可以利用惩罚不会改变参数排名顺序的事实:

# Sort the whole coefficient matrix (all penalty levels kept) by the values
# in its last column, largest first.
coeftab <- coef(best)
lastvals <- coeftab[, ncol(coeftab)]
coeftab_s <- coeftab[order(lastvals, decreasing = TRUE), ]
## plot, leaving out the intercept
# Drop the intercept by row name rather than by position: after sorting it is
# the first row only when it happens to be the largest coefficient, so a
# positional `[, -1]` could silently remove a real predictor instead.
coef_paths <- as.matrix(coeftab_s)
keep <- rownames(coef_paths) != "(Intercept)"
slope_paths <- coef_paths[keep, , drop = FALSE]
# One line per remaining coefficient, traced across the penalty levels.
matplot(t(slope_paths), type = "l")
于 2020-05-31T19:22:00.043 回答