我在一个小样本 tbl_df 上对 Anthony Damico、Brian Diggs 和 data_steve 的答案进行了基准测试,得到了以下结果。
> data <- data.frame('a' = 1:3,
+ 'b' = c('a','b','c'),
+ 'c' = c('d', 'e', 'f'),
+ 'd' = c('g', 'h', 'i'))
> data <- tbl_df(data)
> cols <- c("b", "c", "d")
> microbenchmark(
+ do.call(paste, c(data[cols], sep="-")),
+ apply( data[ , cols ] , 1 , paste , collapse = "-" ),
+ tidyr::unite_(data, "x", cols, sep="-")$x,
+ times=1000
+ )
Unit: microseconds
expr min lq mean median uq max neval
do.call(paste, c(data[cols], sep = "-")) 65.248 78.380 93.90888 86.177 99.3090 436.220 1000
apply(data[, cols], 1, paste, collapse = "-") 223.239 263.044 313.11977 289.514 338.5520 743.583 1000
tidyr::unite_(data, "x", cols, sep = "-")$x 376.716 448.120 556.65424 501.877 606.9315 11537.846 1000
然而,当我在自己的 tbl_df(约 100 万行、10 列)上进行评估时,结果却大不相同。
> microbenchmark(
+ do.call(paste, c(data[c("a", "b")], sep="-")),
+ apply( data[ , c("a", "b") ] , 1 , paste , collapse = "-" ),
+ tidyr::unite_(data, "c", c("a", "b"), sep="-")$c,
+ times=25
+ )
Unit: milliseconds
expr min lq mean median uq max neval
do.call(paste, c(data[c("a", "b")], sep="-")) 930.7208 951.3048 1129.334 997.2744 1066.084 2169.147 25
apply( data[ , c("a", "b") ] , 1 , paste , collapse = "-" ) 9368.2800 10948.0124 11678.393 11136.3756 11878.308 17587.617 25
tidyr::unite_(data, "c", c("a", "b"), sep="-")$c 968.5861 1008.4716 1095.886 1035.8348 1082.726 1759.349 25