对于大样本,我会尝试使用非 equi join in data.table
。样本中的基准比 tmfmnk 的答案差,但对于更大的数据集,它可能会占用更小的内存并且速度更快。
library(data.table)
library(microbenchmark)
intervals <-
data.table(
chr = c("chr1", "chr1", "chr1", "chr1"),
start = c(1, 101, 201, 301),
end = c(100, 200, 300, 400)
)
positions <-
data.table(
chr = rep("chr1", 8),
pos = c(50, 60, 110, 130, 160, 190, 350, 360),
sample = c(
"sampleA",
"sampleB",
"sampleA",
"sampleB",
"sampleA",
"sampleC",
"sampleB",
"sampleB"
)
)
# This takes only the ones we need:
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
chr = i.chr,
start = i.start,
end = i.end,
sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
# Benchmarking
microbenchmark(positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
chr = i.chr,
start = i.start,
end = i.end,
sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
, times = 50)
让我知道它是否有效。我进行了一些更改,以创建更接近您的用例的示例。还包括 Bryan 的基准测试案例。你的方法仍然是最快的。我对第 4 种方法(只有 2 个键)抱有希望,但它仍然较慢。
library(data.table)
library(microbenchmark)
library(GenomicRanges)
set.seed(14)
intervals <-
data.table(chr = paste0("chr", round(runif(
400000, min = 1, max = 22
))), start = round(runif(
n = 400000, min = 1, max = 100000000
)))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]
intervals$end <- intervals$start + 100L
# erase duplicates
intervals <-
unique(intervals[, .SD, .SDcols = c("chr", "start", "end")])
positions <-
data.table(
chr = paste0("chr", round(runif(
30000, min = 1, max = 22
))),
pos = round(runif(
n = 30000, min = 1, max = 100000000
)),
sample = sample(paste0("sample", 1:400), size = 30000, replace = T)
)
setkeyv(intervals, c("chr", "start", "end"))
positions[, pos2 := pos]
setkeyv(positions, c("chr", "pos", "pos2"))
microbenchmark({
interval.gr <-
GRanges(intervals$chr, IRanges(intervals$start, intervals$end))
positions.gr <-
GRanges(positions$chr, IRanges(positions$pos, positions$pos))
ov <- findOverlaps(interval.gr, positions.gr)
res1 <- intervals %>%
slice(queryHits(ov)) %>%
mutate(sample = positions$sample[subjectHits(ov)]) %>%
group_by(chr, start, end) %>%
summarise(n_sample = length(unique(sample))) %>% data.table(.)
intervals_1 <-
rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { interval.gr <- GRanges(intervals$chr, IRanges(intervals$start, intervals$end)) positions.gr <- GRanges(positions$chr, IRanges(positions$pos, positions$pos)) ov <- findOverlaps(interval.gr, positions.gr) res1 <- intervals %>% slice(queryHits(ov)) %>% mutate(sample = positions$sample[subjectHits(ov)]) %>% group_by(chr, start, end) %>% summarise(n_sample = length(unique(sample))) %>% data.table(.) intervals_1 <- rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 148.0612 164.4884 225.8665 178.5495 219.6038 585.5995 50
microbenchmark({
# This takes only the ones we need:
res2 <-
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(
chr = i.chr,
start = i.start,
end = i.end,
sample = x.sample
)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
intervals_2 <-
rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { res2 <- positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(chr = i.chr, start = i.start, end = i.end, sample = x.sample)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")] intervals_2 <- rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 294.4081 338.0309 447.0925 383.2813 548.3734 883.4389 50
microbenchmark({
# Bryan's approach
overlap = foverlaps(intervals,
positions,
type = "any",
by.x = c("chr", "start", "end")) ## return overlap indices
res3 <-
overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)]
intervals_3 <-
rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { overlap = foverlaps(intervals, positions, type = "any", by.x = c("chr", "start", "end")) res3 <- overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)] intervals_3 <- rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 221.907 280.8795 429.961 398.0075 475.0276 1017.866 50
## Starting by 1
set.seed(14)
intervals <-
data.table(chr = paste0("chr", round(runif(
400000, min = 1, max = 22
))), start = round(runif(
n = 400000, min = 1, max = 100000000
)))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]
positions[, start := floor(pos / 100) * 100 + 1L]
positions <-
unique(positions[, .SD, .SDcols = c("chr", "start", "sample")])
setkeyv(intervals, c("chr", "start"))
setkeyv(positions, c("chr", "start"))
microbenchmark({
intervals_4 <-
positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")]#
# add end
intervals_4[, end := start + 100L]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { intervals_4 <- positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")] intervals_4[, `:=`(end, start + 100L)] }
#> min lq mean median uq max neval
#> 419.1001 443.9567 546.0186 478.5985 650.4682 1115.179 50
由reprex 包(v0.3.0)于 2019 年 9 月 25 日创建