以下是我针对您的问题 #1 整理的内容:
创建一个包含大约 98% 的 1 和 2% 的 0 的数据集
制定训练和测试任务
(1) 创建用于过采样(类别平衡,classbalancing)的 PipeOp(po)
(2) 按下面的方式创建学习器;原始代码中的写法不适用于 po
在训练集上训练学习器
在测试集上测试
library(mlr3)
library(dplyr)
library(mlr3pipelines)
# Simulate a reproducible toy data set: two uniform features and a binary
# target that is heavily skewed towards class "1".
set.seed(10)
pcs <- data.frame(a = runif(1000), b = runif(1000))

# navigator is 1 whenever 2 * a + 3 * b exceeds 0.6, and 0 otherwise; it is
# stored as a factor so that it can serve as a classification target.
pcs <- pcs %>%
  mutate(navigator = factor(ifelse(2 * a + 3 * b > 0.6, 1, 0)))

# Classification task on the simulated data, with "1" as the positive class.
task <- TaskClassif$new(
  id = "pcs",
  backend = pcs,
  target = "navigator",
  positive = "1"
)

# Random 80/20 split of the row ids into training and test tasks.
n_obs <- task$nrow
train_set <- sample(n_obs, 0.8 * n_obs)
test_set <- setdiff(seq_len(n_obs), train_set)
task_train <- task$clone()$filter(train_set)
task_test <- task$clone()$filter(test_set)

# Class-balancing PipeOp: only the minority class is adjusted, using a ratio
# of 16 relative to the minority class size, without shuffling the rows.
po_over1 <- po("classbalancing")
po_over1$param_set$values <- list(
  ratio = 16,
  reference = "minor",
  adjust = "minor",
  shuffle = FALSE
)

# Chain the balancing step with a probability-predicting rpart learner and
# wrap the resulting graph so that it can be used like any other learner.
rpart_prob <- lrn("classif.rpart", predict_type = "prob")
learner <- GraphLearner$new(po_over1 %>>% po("learner", rpart_prob))

# Fit on the training task, then predict on the held-out test task.
learner$train(task_train)
pred <- learner$predict(task_test)
输出:
learner$model
#' The predicted class probabilities can be read off by following the
#' printed decision tree from the root down to a leaf.
#' For example, given a data point with features a and b, first check
#' whether b >= 0.112 or b < 0.112 (nodes 2 and 3), then continue with the
#' split on a inside the matching branch until a leaf (marked *) is reached.
1) root 1085 304 1 (0.71981567 0.28018433)
2) b>=0.1122314 728 16 1 (0.97802198 0.02197802)
4) a>=0.007176245 709 0 1 (1.00000000 0.00000000) *
5) a< 0.007176245 19 3 0 (0.15789474 0.84210526) *
3) b< 0.1122314 357 69 0 (0.19327731 0.80672269)
6) a>=0.246552 65 0 1 (1.00000000 0.00000000) *
7) a< 0.246552 292 4 0 (0.01369863 0.98630137) *
# Confusion matrix of the predictions on the test set
pred$confusion
truth
response 1 0
1 195 1
0 0 4
以下是针对问题 #2(SMOTE)的内容:
# SMOTE preprocessing graph.
#
# The "int_to_num" step casts any integer feature to numeric before the
# "smote" step runs (a no-op when the task has no integer features).
#
# The earlier version of this graph ended with a "num_to_int" step that
# rounded EVERY numeric feature to an integer after SMOTE. The features in
# this example (a and b) are continuous values in [0, 1], so that step
# collapsed them to 0/1 during both training and prediction, and the tree
# could only split at 0.5. The round-back step is therefore dropped.
# NOTE: model printouts and confusion matrices recorded with the rounding
# step in place will not match the results of this corrected pipeline.
gr_smote <-
  po("colapply",
    id = "int_to_num",
    applicator = as.numeric,
    affect_columns = selector_type("integer")
  ) %>>%
  po("smote", dup_size = 15)

# Attach a probability-predicting rpart learner to the SMOTE graph and wrap
# the whole pipeline as a single learner.
learner <- GraphLearner$new(
  gr_smote %>>% po("learner", lrn("classif.rpart", predict_type = "prob"))
)

# Fit the pipeline on the training task.
learner$train(task_train)
# Inspect the fitted model of the SMOTE pipeline learner
learner$model
1) root 1085 304 1 (0.7198157 0.2801843)
2) b>=0.5 391 0 1 (1.0000000 0.0000000) *
3) b< 0.5 694 304 1 (0.5619597 0.4380403)
6) a>=0.5 203 0 1 (1.0000000 0.0000000) *
7) a< 0.5 491 187 0 (0.3808554 0.6191446) *
# Predict on the held-out test task with the SMOTE pipeline learner
pred=learner$predict(task_test)
# Confusion matrix of those test-set predictions
pred$confusion
truth
response 1 0
1 159 0
0 36 5