
I am just getting started with mlr3 and still unfamiliar with the syntax. I have two questions:

  1. How do I access the coefficients of a trained logistic regression in mlr3?
  2. I am working with an extremely imbalanced dataset (98% vs. 2%) that has over 2 million rows. I tried the SMOTE method, but it is very slow, even though the same thing finishes quickly in Python. Is there something wrong with my code? Here it is:
library(mlr3)
library(mlr3pipelines)
library(mlr3viz)

# pcs is the (imbalanced) data.frame with target column `navigator`
task = TaskClassif$new("pcs", backend = pcs, target = "navigator", positive = "1")
table(task$truth())

# oversample the minority class to ratio * its original size
po_over = po("classbalancing", id = "oversample", adjust = "minor",
             reference = "minor", shuffle = FALSE, ratio = 16)
table(po_over$train(list(task))$output$truth())

learner = mlr_learners$get("classif.rpart")
learner$predict_type = "prob"

learner = po_over %>>% learner

resampling = rsmp("holdout", ratio = 0.8)

rr = resample(task, learner, resampling, store_models = TRUE)

res <- rr$prediction()
auto1 <- autoplot(res)
auto2 <- autoplot(res, type = "roc")

print(rr$score(msr("classif.acc"))$classif.acc)

For SMOTE:

gr_smote =
  po("colapply", id = "int_to_num",
    applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
  po("smote", dup_size = 15) %>>%
  po("colapply", id = "num_to_int",
    applicator = function(x) as.integer(round(x, 0L)), affect_columns = selector_type("numeric"))

1 Answer


Here is what I put together for your question #1:

  1. Create a dataset with roughly 98% 1s and 2% 0s.

  2. Make training and test tasks.

  3. (1) Create the overbalancing PipeOp.

     (2) Create the learner as a GraphLearner; the way the learner is built in the original code does not work with a PipeOp.

  4. Train the learner on the training set.

  5. Test on the test set.

library(mlr3)
library(dplyr)
library(mlr3pipelines)
set.seed(10)

# 1. Simulate an imbalanced dataset: roughly 98% 1s and 2% 0s
pcs = data.frame(a = runif(1000), b = runif(1000))
pcs = pcs %>%
  mutate(c = 2 * a + 3 * b, d = ifelse(c > .6, 1, 0), navigator = factor(d)) %>%
  select(-c, -d)

# 2. Build the task and split it into training and test sets
task = TaskClassif$new("pcs", backend = pcs, target = "navigator", positive = "1")
train_set = sample(task$nrow, 0.8 * task$nrow)
test_set = setdiff(seq_len(task$nrow), train_set)

task_train <- task$clone()$filter(train_set)
task_test  <- task$clone()$filter(test_set)

# 3. (1) Create the overbalancing PipeOp
po_over1 = po("classbalancing")
po_over1$param_set$values = list(ratio = 16, reference = "minor",
                                 adjust = "minor", shuffle = FALSE)

# 3. (2) Wrap the pipeline in a GraphLearner
learner = GraphLearner$new(
  po_over1 %>>%
    po("learner", lrn("classif.rpart", predict_type = "prob"))
)

# 4. Train on the training task
learner$train(task_train)

# 5. Predict on the test task
pred = learner$predict(task_test)

Output:

learner$model
#' You can see the predicted probability by following the decision tree
#' e.g. say you have a data point a and b
#' first check that b>=.112 or b<.112 (nodes 2 and 3)
#' etc.
1) root 1085 304 1 (0.71981567 0.28018433)  
  2) b>=0.1122314 728  16 1 (0.97802198 0.02197802)  
    4) a>=0.007176245 709   0 1 (1.00000000 0.00000000) *
    5) a< 0.007176245 19   3 0 (0.15789474 0.84210526) *
  3) b< 0.1122314 357  69 0 (0.19327731 0.80672269)  
    6) a>=0.246552 65   0 1 (1.00000000 0.00000000) *
    7) a< 0.246552 292   4 0 (0.01369863 0.98630137) *

#Test predictions
pred$confusion
        truth
response   1   0
       1 195   1
       0   0   4
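The original question #1 (how to access the coefficients of a trained logistic regression) is not covered by the tree example above. A minimal sketch, assuming classif.log_reg from mlr3learners (a wrapper around stats::glm) and the task_train created earlier:

library(mlr3learners)  # provides classif.log_reg

log_learner = lrn("classif.log_reg")
log_learner$train(task_train)

# after training, $model holds the fitted glm object,
# so the usual accessors work
coef(log_learner$model)
summary(log_learner$model)

In a trained GraphLearner the fitted model sits one level deeper in the graph state, keyed by the PipeOp id, e.g. learner$model$classif.log_reg$model.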

Here is question #2, SMOTE. The colapply steps convert integer columns to numeric before SMOTE (which operates on numeric features only) and round them back to integers afterwards:

gr_smote =
  po("colapply", id = "int_to_num",
     applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
  po("smote", dup_size = 15) %>>%
  po("colapply", id = "num_to_int",
     applicator = function(x) as.integer(round(x, 0L)), affect_columns = selector_type("numeric"))

learner = GraphLearner$new(
  gr_smote %>>% po("learner", lrn("classif.rpart", predict_type = "prob"))
)
learner$train(task_train)
learner$model
1) root 1085 304 1 (0.7198157 0.2801843)  
  2) b>=0.5 391   0 1 (1.0000000 0.0000000) *
  3) b< 0.5 694 304 1 (0.5619597 0.4380403)  
    6) a>=0.5 203   0 1 (1.0000000 0.0000000) *
    7) a< 0.5 491 187 0 (0.3808554 0.6191446) *

pred = learner$predict(task_test)
pred$confusion
        truth
response   1   0
       1 159   0
       0  36   5
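A usage sketch: either GraphLearner behaves like an ordinary learner, so it can be plugged straight into the resampling attempted in the question (assuming the task and learner objects built above):

resampling = rsmp("holdout", ratio = 0.8)
rr = resample(task, learner, resampling, store_models = TRUE)
rr$score(msr("classif.acc"))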
answered 2021-03-15T00:13:38.317