首先,vecseq
它不会作为可见函数从 中导出data.table
,因此它的语法和/或行为可能会在未来对包的更新中发生更改而不会发出警告。此外,除了最后的简单检查之外,这是未经测试的。identical
顺便说一句,我们需要一个更大的例子来展示与矢量扫描方法的区别:
require(data.table)
n <- 1e5L
f <- 10L
ni <- n / f
set.seed(54321)
DT <- data.table(x = 1:n + sample(-f:f, n, replace = TRUE))
IT <- data.table(i = 1:ni,
min = seq(from = 1L, to = n, by = f) + sample(0:4, ni, replace = TRUE),
max = seq(from = 1L, to = n, by = f) + sample(5:9, ni, replace = TRUE))
DT
,数据表是 的一个不太随机的子集1:n
。 IT
,区间表是 中的ni = n / 10
非重叠区间1:n
。对所有间隔进行重复矢量扫描ni
需要一段时间:
system.time({
ans.vecscan <- IT[, DT[x >= min & x <= max], by = i]
})
## user system elapsed
## 84.15 4.48 88.78
可以在区间端点上执行两个滚动连接(请参阅 中的roll
参数?data.table
)以一举获得所有内容:
system.time({
# Save time if DT is already keyed correctly
if(!identical(key(DT), "x")) setkey(DT, x)
DT[, row := .I]
setkey(IT, min)
target.low <- IT[DT, roll = Inf, nomatch = 0][, list(min = row[1]), keyby = i]
# Non-overlapping intervals => (sorted by min => sorted by max)
setattr(IT, "sorted", "max")
target.high <- IT[DT, roll = -Inf, nomatch = 0][, list(max = last(row)), keyby = i]
target <- target.low[target.high, nomatch = 0]
target[, len := max - min + 1L]
rm(target.low, target.high)
ans.roll <- DT[data.table:::vecseq(target$min, target$len, NULL)][, i := unlist(mapply(rep, x = target$i, times = target$len, SIMPLIFY=FALSE))]
ans.roll[, row := NULL]
setcolorder(ans.roll, c("i", "x"))
})
## user system elapsed
## 0.12 0.00 0.12
确保相同的行顺序验证结果:
setkey(ans.vecscan, i, x)
setkey(ans.roll, i, x)
identical(ans.vecscan, ans.roll)
## [1] TRUE