# split the `messy_string` and create a long table, keeping track of the id
DT2 <- setkey(DT[, list(val=unlist(strsplit(messy_string, "\\$"))), by=list(ID, messy_string)], "val")
# add the columns, initialize to 0
DT2[, c(elements_list) := 0L]
# warning expected, re:adding large ammount of columns
# iterate over each value in element_list, assigning 1's ass appropriate
for (el in elements_list)
DT2[el, c(el) := 1L]
# sum by ID
DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]
请注意,我们正在携带该messy_string
列,因为它比留下它然后join
在 ID 上取回它更便宜。如果您在最终输出中不需要它,只需将其删除即可。
基准:
创建示例数据:
# sample data, using OP's exmple
set.seed(10)
N <- 1e6 # number of rows
elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))
messy_string_vec <- random_string_fast(N, 2, 5, "$") # Create the messy strings in a single shot.
masterDT <- data.table(ID = c(1:N), messy_string = messy_string_vec, key="ID") # create the data.table
旁注
一次创建所有随机字符串并将结果分配为单个列比调用函数 N 次并一一分配要快得多。
# Faster way to create the `messy_string` 's
random_string_fast <- function(N, min_length, max_length, separator) {
ints <- seq(from=min_length, to=max_length)
replicate(N, paste(sample(elements_list, sample(ints)), collapse=separator))
}
比较四种方法:
- 这个答案——“DT.RS”
- @eddi 的回答——“Plyr.eddi”
- @GeekTrader 的回答——DT.GT
- GeekTrader 对一些修改的回答——DT.GT_Mod
这是设置:
library(data.table); library(plyr); library(microbenchmark)
# data.table method - RS
usingDT.RS <- quote({DT <- copy(masterDT);
DT2 <- setkey(DT[, list(val=unlist(strsplit(messy_string, "\\$"))), by=list(ID, messy_string)], "val"); DT2[, c(elements_list) := 0L]
for (el in elements_list) DT2[el, c(el) := 1L]; DT2[, lapply(.SD, sum), by=list(ID, messy_string), .SDcols=elements_list]})
# data.table method - GeekTrader
usingDT.GT <- quote({dt <- copy(masterDT); myFunc()})
# data.table method - GeekTrader, modified by RS
usingDT.GT_Mod <- quote({dt <- copy(masterDT); myFunc.modified()})
# ply method from below
usingPlyr.eddi <- quote({dt <- copy(masterDT); indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i) dt[i, data.frame(t(as.matrix(table(strsplit(messy_string, split = "\\$"))))) ]));
dt = cbind(dt, indicators); dt[is.na(dt)] = 0; dt })
以下是基准测试结果:
microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), usingPlyr.eddi=eval(usingPlyr.eddi), times=5L)
On smaller data:
N = 600
Unit: milliseconds
expr min lq median uq max
1 usingDT.GT 1189.7549 1198.1481 1200.6731 1202.0972 1203.3683
2 usingDT.GT_Mod 581.7003 591.5219 625.7251 630.8144 650.6701
3 usingDT.RS 2586.0074 2602.7917 2637.5281 2819.9589 3517.4654
4 usingPlyr.eddi 2072.4093 2127.4891 2225.5588 2242.8481 2349.6086
N = 1,000
Unit: seconds
expr min lq median uq max
1 usingDT.GT 1.941012 2.053190 2.196100 2.472543 3.096096
2 usingDT.RS 3.107938 3.344764 3.903529 4.010292 4.724700
3 usingPlyr 3.297803 3.435105 3.625319 3.812862 4.118307
N = 2,500
Unit: seconds
expr min lq median uq max
1 usingDT.GT 4.711010 5.210061 5.291999 5.307689 7.118794
2 usingDT.GT_Mod 2.037558 2.092953 2.608662 2.638984 3.616596
3 usingDT.RS 5.253509 5.334890 6.474915 6.740323 7.275444
4 usingPlyr.eddi 7.842623 8.612201 9.142636 9.420615 11.102888
N = 5,000
expr min lq median uq max
1 usingDT.GT 8.900226 9.058337 9.233387 9.622531 10.839409
2 usingDT.GT_Mod 4.112934 4.293426 4.460745 4.584133 6.128176
3 usingDT.RS 8.076821 8.097081 8.404799 8.800878 9.580892
4 usingPlyr.eddi 13.260828 14.297614 14.523016 14.657193 16.698229
# dropping the slower two from the tests:
microbenchmark( usingDT.RS=eval(usingDT.RS), usingDT.GT=eval(usingDT.GT), usingDT.GT_Mod=eval(usingDT.GT_Mod), times=6L)
N = 10,000
Unit: seconds
expr min lq median uq max
1 usingDT.GT_Mod 8.426744 8.739659 8.750604 9.118382 9.848153
2 usingDT.RS 15.260702 15.564495 15.742855 16.024293 16.249556
N = 25,000
... (still running)
-----------------
基准测试中使用的函数:
# original random string function
random_string <- function(min_length, max_length, separator) {
selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)
return(selection)
}
# GeekTrader's function
myFunc <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")
COLS <- do.call(rbind,
lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]]))
)
}
)
)
res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
dt <- cbind(dt, res)
for (j in names(dt))
set(dt,which(is.na(dt[[j]])),j,0)
return(dt)
}
# Improvements to @GeekTrader's `myFunc` -RS '
myFunc.modified <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")
## MODIFICATIONS:
# using `rbindlist` instead of `do.call(rbind.. )`
COLS <- rbindlist( lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]])),
# MODICIATION: Not coercing to factors
stringsAsFactors = FALSE
)
}
)
)
# MODIFICATION: Preserve as matrix, the output of tapply
res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )
# FLATTEN into a data.table
resdt <- data.table(r=c(res2))
# FIND & REPLACE NA's of single column
resdt[is.na(r), r:=0L]
# cbind with dt, a matrix, with the same attributes as `res2`
cbind(dt,
matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
}
### Benchmarks comparing the two versions of GeekTrader's function:
orig = quote({dt <- copy(masterDT); myFunc()})
modified = quote({dt <- copy(masterDT); myFunc.modified()})
microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)
# Unit: milliseconds
# expr min lq median uq max
# 1 Modified 895.025 971.0117 1011.216 1189.599 2476.972
# 2 Orig 1953.638 2009.1838 2106.412 2230.326 2356.802