我想粘贴列名及其值。它必须基于某些条件(if 语句),它可以基于单个变量或多个变量。
下面是一个小例子,展示了数据的样子。我想加快这个过程并获得与 fun2、fun3 和 fun4 相同的结果。
为了使这一点尽可能简单,如果 a、b、c 和 d 列的值大于零,则只有一个规则可以设置为缺失。但是,我留下了规则的名称,因为它可以不同,比如“规则 1”> 0 和“规则 2”如果是非缺失的。
library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
这是数据的样子:
id t1 t2 a b c d re
1 0.6883367 -0.3454049 0 0 0 0 ''
2 -1.0653127 -1.3035077 NA NA NA NA ''
3 0.5210550 0.8489376 0 NA 0 1 ''
4 0.3697369 -0.1135827 1 0 NA 1 ''
5 1.3195759 -1.5431305 0 1 0 0 ''
6 -0.2106836 -0.3421900 NA 0 1 1 ''
7 -0.2258871 -2.1644697 1 1 NA 0 ''
8 -0.7132686 1.7673775 1 NA 1 1 ''
9 0.9467068 1.8188665 0 1 1 NA ''
10 -0.3900479 1.7306935 1 1 1 1 ''
波纹管是所需的输出。这个想法是保留一列,其中包含一些值已设置为缺失的原因的描述。在此示例中,只有前两个人同时拥有 t1 和 t2 的记录。个人 1、2 和 3 有 t1 的记录,而个人 1、2、5、7 和 9 有 t2 的记录。
id t1 t2 a b c d re
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1);"
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1);"
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1);"
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"
第一次尝试 (fun1)。不是预期的结果,因为它在 mutate 中查找单个空格。所有其他函数(fun2、fun3 和 fun4)打印正确的结果。
fun1 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 NA 1.43 0 0 0 0 "Rule1:t1( ); "
2 NA 0.733 NA NA NA NA "Rule1:t1( ); "
3 NA NA 0 NA 0 1 "Rule2:t2(d=1); Rule1:t1( ); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 2 (fun2) 使用“trimws”。
fun2 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 3 (fun3) 使用带有正则表达式的“gsub”。
fun3 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := gsub("\\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "
函数 4 (fun4) 在 mutate 中使用 stri_detect 和正则表达式。
fun4 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
使用更多数据进行基准测试
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
fun1(dat) 100 642 653 660 668 666 774 66800 1.00
fun2(dat) 100 742 756 763 773 768 874 77300 1.16
fun3(dat) 100 765 779 785 794 791 903 79400 1.19
fun4(dat) 100 743 756 763 777 770 1010 77700 1.16
有没有人知道如何加快这个过程?
谢谢你。