0

我正在“recipes”框架内(使用“embed”包中的“step_woe”)对我的特征应用 WOE(证据权重)转换,但默认情况下它以目标值 0 作为参考水平,因此得到的 WOE 值方向与我期望的相反。

我尝试对目标变量重新设定因子水平(relevel),将“1”设为参考水平,但结果相同(WOE 值的方向没有变化)。有人知道该如何正确处理吗?

下面是一个示例:首先创建示例数据集,其中包含一个目标变量(0 和 1)和一个特征("yes"、"no"),二者完全相关。然后应用 step_woe 转换,并分别将参考水平设为“0”和“1”以比较结果——两者没有差异。


library(tidyverse)
library(recipes)
#> 
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stringr':
#> 
#>     fixed
#> The following object is masked from 'package:stats':
#> 
#>     step
library(embed)
  
# Build a toy data set in which the predictor perfectly determines the
# outcome: `feature` is "yes" exactly when `target` is 1 and "no" otherwise.
# Both columns are converted to factors, and the result is printed as well as
# assigned.
example_df <-
  tibble(
    target  = rbinom(1000, 1, 0.5),
    feature = ifelse(target == 1, "yes", "no")
  ) %>%
  # across() replaces the superseded mutate_all() scoped verb.
  mutate(across(everything(), as.factor)) %>%
  print()
#> # A tibble: 1,000 x 2
#>    target feature
#>    <fct>  <fct>  
#>  1 0      no     
#>  2 1      yes    
#>  3 0      no     
#>  4 0      no     
#>  5 1      yes    
#>  6 0      no     
#>  7 1      yes    
#>  8 1      yes    
#>  9 0      no     
#> 10 0      no     
#> # … with 990 more rows

# Variant A: relevel the outcome so that "0" is the reference level, then
# WOE-encode every nominal predictor against `target`.
spec_0 <- recipe(target ~ feature, data = example_df)
spec_0 <- step_relevel(spec_0, target, ref_level = "0")
spec_0 <- embed::step_woe(spec_0, all_nominal_predictors(), outcome = "target")

# Estimate the steps without keeping the processed training set.
woe_recipe_0 <- prep(spec_0, retain = FALSE)

# step_woe is the second step in the recipe; inspect what it estimated.
tidy(woe_recipe_0, number = 2)
#> # A tibble: 2 x 10
#>   terms   value n_tot   n_0   n_1   p_0   p_1   woe outcome id       
#>   <chr>   <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>   <chr>    
#> 1 feature no      493   493     0     1     0  20.0 target  woe_nY7AB
#> 2 feature yes     507     0   507     0     1 -20.0 target  woe_nY7AB

# Variant B: same recipe, but with "1" as the reference level of the outcome,
# so the result can be compared with the ref_level = "0" variant.
spec_1 <- recipe(target ~ feature, data = example_df)
spec_1 <- step_relevel(spec_1, target, ref_level = "1")
spec_1 <- embed::step_woe(spec_1, all_nominal_predictors(), outcome = "target")

# Estimate the steps without keeping the processed training set.
woe_recipe_1 <- prep(spec_1, retain = FALSE)

# step_woe is the second step in the recipe; inspect what it estimated.
tidy(woe_recipe_1, number = 2)
#> # A tibble: 2 x 10
#>   terms   value n_tot   n_0   n_1   p_0   p_1   woe outcome id       
#>   <chr>   <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>   <chr>    
#> 1 feature no      493   493     0     1     0  20.0 target  woe_Lt6pK
#> 2 feature yes     507     0   507     0     1 -20.0 target  woe_Lt6pK

# Print the R version, platform and attached/loaded package versions.
sessionInfo()
#> R version 3.5.1 (2018-07-02)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Red Hat Enterprise Linux
#> 
#> Matrix products: default
#> BLAS: /opt/R/3.5.1/lib64/R/lib/libRblas.so
#> LAPACK: /opt/R/3.5.1/lib64/R/lib/libRlapack.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] embed_0.1.5     recipes_0.1.17  forcats_0.4.0   stringr_1.4.0  
#>  [5] dplyr_1.0.7     purrr_0.3.4     readr_1.3.1     tidyr_1.1.2    
#>  [9] tibble_3.0.4    ggplot2_3.3.5   tidyverse_1.3.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] httr_1.4.1            jsonlite_1.6          splines_3.5.1        
#>  [4] prodlim_2019.11.13    modelr_0.1.5          RcppParallel_5.0.2   
#>  [7] assertthat_0.2.1      highr_0.8             cellranger_1.1.0     
#> [10] yaml_2.2.0            ipred_0.9-12          pillar_1.6.2         
#> [13] backports_1.2.1       lattice_0.20-35       glue_1.5.1           
#> [16] reticulate_1.13       digest_0.6.27         rvest_0.3.5          
#> [19] colorspace_2.0-0      htmltools_0.4.0       Matrix_1.2-14        
#> [22] timeDate_3043.102     pkgconfig_2.0.3       broom_0.7.6          
#> [25] haven_2.2.0           scales_1.1.0          whisker_0.4          
#> [28] gower_0.2.1           lava_1.6.6            generics_0.1.0       
#> [31] ellipsis_0.3.2        withr_2.4.1           keras_2.2.5.0        
#> [34] nnet_7.3-12           cli_2.4.0             survival_2.42-3      
#> [37] magrittr_2.0.1        crayon_1.4.1          readxl_1.3.1         
#> [40] evaluate_0.14         fs_1.3.1              fansi_0.4.2          
#> [43] MASS_7.3-51.4         xml2_1.2.2            class_7.3-14         
#> [46] tools_3.5.1           hms_1.1.1             lifecycle_1.0.1      
#> [49] munsell_0.5.0         reprex_0.3.0          compiler_3.5.1       
#> [52] rlang_0.4.12          grid_3.5.1            rstudioapi_0.11      
#> [55] base64enc_0.1-3       rmarkdown_1.18        gtable_0.3.0         
#> [58] DBI_1.1.1             R6_2.5.0              tfruns_1.4           
#> [61] lubridate_1.7.4       knitr_1.26            tensorflow_2.0.0     
#> [64] uwot_0.1.5            utf8_1.2.1            zeallot_0.1.0        
#> [67] stringi_1.4.3         Rcpp_1.0.7            vctrs_0.3.8          
#> [70] rpart_4.1-15          dbplyr_2.1.1          tidyselect_1.1.1.9000
#> [73] xfun_0.11

由 reprex 包(v0.3.0)于 2022-02-02 创建

4

0 回答 0