# Packages
library(dplyr)
library(recipes)
# Toy dataset: one row per (name, color) pair, so the multicolored item "A"
# spans three rows.
df <- tibble(
  name = c("A", "A", "A", "B", "C"),
  color = c("green", "yellow", "purple", "green", "blue")
)
#> # A tibble: 5 x 2
#> name color
#> <chr> <chr>
#> 1 A green
#> 2 A yellow
#> 3 A purple
#> 4 B green
#> 5 C blue
recipe 步骤的效果很好:
# One-hot encode `color`: with one_hot = TRUE every color level gets its own
# 0/1 indicator column. prep() estimates the step on `df`, and juice() returns
# the processed training data.
dummified_df <- juice(
  prep(
    recipe(. ~ ., data = df) %>%
      step_dummy(color, one_hot = TRUE),
    training = df
  )
)
#> # A tibble: 5 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 0 0
#> 2 A 0 0 0 1
#> 3 A 0 0 1 0
#> 4 B 0 1 0 0
#> 5 C 1 0 0 0
但我真正想要的是下面这样的结果:每个观测值只占一行,因为多色的项目不再需要占用多行。
# Collapse the one-hot rows to a single row per `name`: an indicator is 1 when
# at least one row of that item has the color, and 0 otherwise.
summarized_dummified_df <- dummified_df %>%
  group_by(name) %>%
  # across() replaces the superseded summarise_all(); grouping columns are
  # excluded from everything() automatically. The comparison yields a single
  # logical per group, so as.numeric() is sufficient (and always returns a
  # double), which makes the vectorised ifelse() unnecessary.
  summarise(across(everything(), ~ as.numeric(max(.x) > 0))) %>%
  ungroup()
#> # A tibble: 3 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 1 1
#> 2 B 0 1 0 0
#> 3 C 1 0 0 0
显然,我可以像上面这样做。但是为了将我的 recipe 步骤完全集成到 tidymodels 生态系统中(例如使用 workflows),如果能直接在 recipe 中把那些因为有了虚拟变量而不再需要重复的行合并起来,就会好得多。
是否有任何 tidymodels 认可的方法来获得此结果?
我也尝试过用 mlr3 来实现,但无济于事,因为我找不到任何适合用来聚合行的 PipeOp。
library("mlr3")
library("mlr3pipelines")
# Classification task built on the same toy data, with `price` as the target.
task <- TaskClassif$new(
  id = "task",
  backend = data.table::data.table(
    name = c("A", "A", "A", "B", "C"),
    color = as.factor(c("green", "yellow", "purple", "green", "blue")),
    price = as.factor(c("low", "low", "low", "high", "low"))
  ),
  target = "price"
)

# The "encode" PipeOp one-hot encodes the factor feature `color`, but the
# result still has one row per original observation.
poe <- po("encode")
poe$train(list(task))[[1]]$data()
#> price name color.blue color.green color.purple color.yellow
#> 1: low A 0 1 0 0
#> 2: low A 0 0 0 1
#> 3: low A 0 0 1 0
#> 4: high B 0 1 0 0
#> 5: low C 1 0 0 0
我正在研究如何创建自定义的 step_ 函数或自定义的 PipeOp,但我仍然觉得自己遗漏了什么,因为在我看来这种数据类型并不罕见。