r - 如何根据某些条件优化粘贴单个/多个列名及其值

Question

我想粘贴列名及其值。它必须基于某些条件（if 语句），它可以基于单个变量或多个变量。

下面是一个小例子，展示了数据的样子。我想加快这个过程并获得与 fun2、fun3 和 fun4 相同的结果。

为了使这一点尽可能简单，如果 a、b、c 和 d 列的值大于零，则只有一个规则可以设置为缺失。但是，我留下了规则的名称，因为它可以不同，比如“规则 1”> 0 和“规则 2”如果是非缺失的。

library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")

dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")

这是数据的样子：

id         t1         t2  a  b  c  d re
 1  0.6883367 -0.3454049  0  0  0  0 '' 
 2 -1.0653127 -1.3035077 NA NA NA NA '' 
 3  0.5210550  0.8489376  0 NA  0  1 '' 
 4  0.3697369 -0.1135827  1  0 NA  1 '' 
 5  1.3195759 -1.5431305  0  1  0  0 '' 
 6 -0.2106836 -0.3421900 NA  0  1  1 '' 
 7 -0.2258871 -2.1644697  1  1 NA  0 '' 
 8 -0.7132686  1.7673775  1 NA  1  1 '' 
 9  0.9467068  1.8188665  0  1  1 NA '' 
10 -0.3900479  1.7306935  1  1  1  1 ''

波纹管是所需的输出。这个想法是保留一列，其中包含一些值已设置为缺失的原因的描述。在此示例中，只有前两个人同时拥有 t1 和 t2 的记录。个人 1、2 和 3 有 t1 的记录，而个人 1、2、5、7 和 9 有 t2 的记录。

id       t1     t2     a     b     c     d    re                                      
 1  -0.182   1.43      0     0     0     0   ""                                      
 2  -1.31    0.733    NA    NA    NA    NA   ""                                      
 3  -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1);"                       
 4  NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1);"        
 5  NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
 6  NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1);"        
 7  NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
 8  NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"   
 9  NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
10  NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"

第一次尝试 (fun1)。不是预期的结果，因为它在 mutate 中查找单个空格。所有其他函数（fun2、fun3 和 fun4）打印正确的结果。

fun1 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id    t1     t2     a     b     c     d    re                                      
<int> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1    NA   1.43     0     0     0     0   "Rule1:t1(  ); "                        
   2    NA   0.733   NA    NA    NA    NA   "Rule1:t1(  ); "                        
   3    NA  NA        0    NA     0     1   "Rule2:t2(d=1); Rule1:t1(  ); "         
   4    NA  NA        1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5    NA   1.78     0     1     0     0   "Rule1:t1( b=1 ); "                     
   6    NA  NA       NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7    NA  -0.345    1     1    NA     0   "Rule1:t1(a=1 b=1 ); "                  
   8    NA  NA        1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9    NA  -1.22     0     1     1    NA   "Rule1:t1( b=1 c=1); "                  
  10    NA  NA        1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数 2 (fun2) 使用“trimws”。

fun2 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0   ""                                      
   2 -1.31    0.733    NA    NA    NA    NA   ""                                      
   3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
   5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
   6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
   7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
   8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
  10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数 3 (fun3) 使用带有正则表达式的“gsub”。

fun3 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := gsub("\\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
  1 -0.182   1.43      0     0     0     0   ""                                      
  2 -1.31    0.733    NA    NA    NA    NA   ""                                      
  3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
  4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
  5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
  6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
  7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1b=1); "                   
  8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1c=1); "   
  9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1c=1); "                   
 10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "

函数 4 (fun4) 在 mutate 中使用 stri_detect 和正则表达式。

fun4 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0 ""                                      
   2 -1.31    0.733    NA    NA    NA    NA ""                                      
   3 -0.0613 NA         0    NA     0     1 "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1 "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5 NA       1.78      0     1     0     0 "Rule1:t1( b=1 ); "                     
   6 NA      NA        NA     0     1     1 "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7 NA      -0.345     1     1    NA     0 "Rule1:t1(a=1 b=1 ); "                  
   8 NA      NA         1    NA     1     1 "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA "Rule1:t1( b=1 c=1); "                  
  10 NA      NA         1     1     1     1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

使用更多数据进行基准测试

n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")

benchmark(fun1(dat),
          fun2(dat),
          fun3(dat),
          fun4(dat))

Benchmark summary:
  Time units : milliseconds 
     expr n.eval min lw.qu median mean up.qu  max total relative
fun1(dat)    100 642   653    660  668   666  774 66800     1.00
fun2(dat)    100 742   756    763  773   768  874 77300     1.16
fun3(dat)    100 765   779    785  794   791  903 79400     1.19
fun4(dat)    100 743   756    763  777   770 1010 77700     1.16

有没有人知道如何加快这个过程？

谢谢你。

score 3 · Accepted Answer

在前面，我承认我无法通过基准测试（感谢挑战）。可能有一些方法可以加快速度，但让我推荐一种方法，它可以做同样的事情（对于较小的数据更快，对于大数据也差不多）但支持每个规则的功能。这不是您直接询问的内容，而是您暗示了每个规则的不同功能。

（我已经更新了代码，感谢@Cole 找到了我早期探索的残余。）

RULES <- list(
  Rule1 = list(
    rule = "Rule1",
    lhs = "t1",
    rhs = c("a", "b", "c"),
    fun = function(z) !is.na(z) & z > 0
  ),
  Rule2 = list(
    rule = "Rule2",
    lhs = "t2",
    rhs = "d",
    fun = is.na
    )
)

fun9 <- function(dat, RULES = list()) {
  nr <- nrow(dat)
  # RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
  RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
  for (r in seq_along(RULES)) {
    fun <- RULES[[r]]$fun
    lhs <- RULES[[r]]$lhs
    for (rhs in RULES[[r]]$rhs) {
      lgl <- do.call(fun, list(dat[[rhs]]))
      set(dat, which(lgl), lhs, NA)
      RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
    }
    ind <- nzchar(RE[[r]])
    RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
  }
  set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}

RULES和使用的前提fun9应该是不言而喻的。

用小数据进行基准测试似乎很有希望：

set.seed(2021)
dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")
fun9(dat, RULES)[]
#        id         t1         t2     a     b     c     d                                re
#     <int>      <num>      <num> <num> <num> <num> <num>                            <char>
#  1:     1 -0.1224600 -1.0822049     0     0     0     0                                 ;
#  2:     2  0.5524566         NA    NA    NA    NA    NA                   ;Rule2:t2( d=1)
#  3:     3  0.3486495  0.1819954     0    NA     0     1                                 ;
#  4:     4         NA  1.5085418     1     0    NA     1                   Rule1:t1( a=1);
#  5:     5         NA  1.6044701     0     1     0     0                   Rule1:t1( b=1);
#  6:     6         NA -1.8414756    NA     0     1     1                   Rule1:t1( c=1);
#  7:     7         NA  1.6233102     1     1    NA     0               Rule1:t1( a=1 b=1);
#  8:     8         NA  0.1313890     1    NA     1     1               Rule1:t1( a=1 c=1);
#  9:     9         NA         NA     0     1     1    NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10:    10         NA  1.5133183     1     1     1     1           Rule1:t1( a=1 b=1 c=1);

bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time             gc                  
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>           <list>              
# 1 fun4(dat)          9.52ms   11.1ms      88.5     316KB     2.06    43     1      486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]>    <tibble [44 x 3]>   
# 2 fun9(dat, RULES)   97.5us  113.5us    7760.       416B     6.24  3731     3      481ms <NULL> <Rprofmem[,3] [2 x 3]>  <bch:tm [3,734]> <tibble [3,734 x 3]>

从开始`itr/sec`，这fun9看起来要快一些。

使用更大的数据：

set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                   time         gc              
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                   <list>       <list>          
# 1 fun4(dat)           1.24s    1.24s     0.806    62.9MB     1.61     1     2      1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms  315.4ms     3.17     53.8MB     4.76     2     3    630.8ms <NULL> <Rprofmem[,3] [70 x 3]>  <bch:tm [2]> <tibble [2 x 3]>

虽然此解决方案不使用tidytable或其流动，但它更快。清理re是另一个步骤，可能会使这个速度回到凡人的水平:-)。

旁注：我试图使用lapply,和其他技巧在mget数据环境中做事，但最后，使用( https://stackoverflow.com/a/16846530/3358272 ) 和简单向量似乎是最快的.data.tabledata.table::set

r - 如何根据某些条件优化粘贴单个/多个列名及其值

1 回答 1

Related

Reference