r - R：计算每个间隔的唯一元素

Question

假设我有一个不重叠的基因组区间列表。

chr1    1   100
chr1    101 200
chr1    201 300
chr1    301 400

以及与不同样本相关的基因组位置列表：

chr1    50  sampleA
chr1    60  sampleB
chr1    110 sampleA
chr1    130 sampleB
chr1    160 sampleA
chr1    190 sampleC
chr1    350 sampleB
chr1    360 sampleB

我的目标是计算每个间隔的唯一样本数。在我的真实数据集中，间隔表是 ~400.000 行，基因组位置样本表是 ~30.000 行。

该计算嵌入在模拟中，因此它应该尽可能快。我已经尝试过使用 GenomicRanges 作为：

require(GenomicRanges)
interval.gr <- GRanges(intervals$chr,IRanges(intervals$start,intervals$end))
positions.gr <- GRanges(positions$chr,IRanges(positions$pos,positions$pos))
ov <- findOverlaps(interval.gr,positions.gr)
intervals %>%
  slice(queryHits(ov)) %>%
  mutate(sample=positions$sample[subjectHits(ov)]) %>% 
  group_by(chr,start,end) %>% 
  summarise(n_sample=length(unique(sample)))

结果是

# A tibble: 3 x 4
# Groups:   chr, start [3]
  chr   start   end n_sample
  <fct> <dbl> <dbl>    <int>
1 chr1      1   100        2
2 chr1    101   200        3
3 chr1    301   400        1

然而，它仍然会在没有样本的情况下下降间隔（201-300），而且速度也不是很快。使用我的数据集：

Unit: milliseconds
 expr      min      lq     mean   median       uq      max neval
    x 159.3901 161.621 190.1703 164.4879 168.3116 297.8395    10

我想知道是否有更好更快的方法来进行这种分析？

谢谢

可重现的数据集：

intervals <- data.frame(chr=c("chr1","chr1","chr1","chr1"),start=c(1,101,201,301),end=c(100,200,300,400))

positions <- data.frame(chr=rep("chr1",8),pos=c(50,60,110,130,160,190,350,360),sample=c("sampleA","sampleB","sampleA","sampleB","sampleA","sampleC","sampleB","sampleB"))

编辑

与我的真实数据集大小相同的可重现数据集

intervals <- data.frame(chr=paste0("chr",round(runif(400000,min = 1,max = 22))),start=round(runif(n = 400000,min = 1,max = 100000000)))
intervals$end <- intervals$start+100

positions <- data.frame(chr=paste0("chr",round(runif(30000,min = 1,max = 22))),pos=round(runif(n = 30000,min = 1,max = 100000000)),sample=sample(paste0("sample",1:400),size = 30000,replace=T))

score 2 · Accepted Answer

基于@Jon 所说，data.table 是解决此问题的好方法。使用函数 foverlaps() 可以大大提高速度。

library(data.table)
intervals <- data.frame(chr=c("chr1","chr1","chr1","chr1"),
                        start=c(1,101,201,301),
                        end=c(100,200,300,400))

positions <- data.frame(chr=rep("chr1",8),
                        pos=c(50,60,110,130,160,190,350,360),
                        sample=c("sampleA","sampleB","sampleA","sampleB","sampleA","sampleC","sampleB","sampleB"))
setDT(positions)
setDT(intervals)

  positions[, pos_tmp := pos]
  setkey(positions,chr, pos, pos_tmp)
  overlap = foverlaps(intervals, positions, type="any",by.x=c("chr","start", "end")) ## return overlap indices
  overlap[!is.na(sample),.(n_sample = .N), by = .(chr, start, end)]

与@Jon 在我的机器上大约需要 6 秒的实现相比，上述实现需要大约 180 毫秒

score 0 · Accepted Answer

对于大样本，我会尝试使用非 equi join in data.table。样本中的基准比 tmfmnk 的答案差，但对于更大的数据集，它可能会占用更小的内存并且速度更快。

library(data.table)
library(microbenchmark)

intervals <-
  data.table(
    chr = c("chr1", "chr1", "chr1", "chr1"),
    start = c(1, 101, 201, 301),
    end = c(100, 200, 300, 400)
  )
positions <-
  data.table(
    chr = rep("chr1", 8),
    pos = c(50, 60, 110, 130, 160, 190, 350, 360),
    sample = c(
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleC",
      "sampleB",
      "sampleB"
    )
  )


# This takes only the ones we need:
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]


# Benchmarking
microbenchmark(positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
, times = 50)

让我知道它是否有效。我进行了一些更改，以创建更接近您的用例的示例。还包括 Bryan 的基准测试案例。你的方法仍然是最快的。我对第 4 种方法（只有 2 个键）抱有希望，但它仍然较慢。

library(data.table)
library(microbenchmark)
library(GenomicRanges)
set.seed(14)
intervals <-
    data.table(chr = paste0("chr", round(runif(
        400000, min = 1, max = 22
    ))), start = round(runif(
        n = 400000, min = 1, max = 100000000
    )))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]

intervals$end <- intervals$start + 100L

# erase duplicates
intervals <-
    unique(intervals[, .SD, .SDcols = c("chr", "start", "end")])

positions <-
    data.table(
        chr = paste0("chr", round(runif(
            30000, min = 1, max = 22
        ))),
        pos = round(runif(
            n = 30000, min = 1, max = 100000000
        )),
        sample = sample(paste0("sample", 1:400), size = 30000, replace = T)
    )

setkeyv(intervals, c("chr", "start", "end"))
positions[, pos2 := pos]
setkeyv(positions, c("chr", "pos", "pos2"))


microbenchmark({
    interval.gr <-
        GRanges(intervals$chr, IRanges(intervals$start, intervals$end))
    positions.gr <-
        GRanges(positions$chr, IRanges(positions$pos, positions$pos))
    ov <- findOverlaps(interval.gr, positions.gr)
    res1 <- intervals %>%
        slice(queryHits(ov)) %>%
        mutate(sample = positions$sample[subjectHits(ov)]) %>%
        group_by(chr, start, end) %>%
        summarise(n_sample = length(unique(sample))) %>% data.table(.)

    intervals_1 <-
        rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 expr
#>  {     interval.gr <- GRanges(intervals$chr, IRanges(intervals$start,          intervals$end))     positions.gr <- GRanges(positions$chr, IRanges(positions$pos,          positions$pos))     ov <- findOverlaps(interval.gr, positions.gr)     res1 <- intervals %>% slice(queryHits(ov)) %>% mutate(sample = positions$sample[subjectHits(ov)]) %>%          group_by(chr, start, end) %>% summarise(n_sample = length(unique(sample))) %>%          data.table(.)     intervals_1 <- rbind(res1, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>       min       lq     mean   median       uq      max neval
#>  148.0612 164.4884 225.8665 178.5495 219.6038 585.5995    50


microbenchmark({
    # This takes only the ones we need:
    res2 <-
        positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(
            chr = i.chr,
            start = i.start,
            end = i.end,
            sample = x.sample
        )][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
    intervals_2 <-
        rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]

}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                                                                             expr
#>  {     res2 <- positions[intervals, on = .(chr == chr, pos <= end,          pos >= start), nomatch = 0L, .(chr = i.chr, start = i.start,          end = i.end, sample = x.sample)][, .(n_sample = length(unique(sample,          na.rm = TRUE))), by = c("chr", "start", "end")]     intervals_2 <- rbind(res2, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>       min       lq     mean   median       uq      max neval
#>  294.4081 338.0309 447.0925 383.2813 548.3734 883.4389    50

microbenchmark({
    # Bryan's approach
    overlap = foverlaps(intervals,
                        positions,
                        type = "any",
                        by.x = c("chr", "start", "end")) ## return overlap indices
    res3 <-
        overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)]
    intervals_3 <-
        rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]

}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                 expr
#>  {     overlap = foverlaps(intervals, positions, type = "any", by.x = c("chr",          "start", "end"))     res3 <- overlap[!is.na(sample), .(n_sample = length(unique(sample))),          by = .(chr, start, end)]     intervals_3 <- rbind(res3, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>      min       lq    mean   median       uq      max neval
#>  221.907 280.8795 429.961 398.0075 475.0276 1017.866    50


## Starting by 1

set.seed(14)
intervals <-
    data.table(chr = paste0("chr", round(runif(
        400000, min = 1, max = 22
    ))), start = round(runif(
        n = 400000, min = 1, max = 100000000
    )))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]


positions[, start := floor(pos / 100) * 100 + 1L]
positions <-
    unique(positions[, .SD, .SDcols = c("chr", "start", "sample")])
setkeyv(intervals, c("chr", "start"))
setkeyv(positions, c("chr", "start"))
microbenchmark({
    intervals_4 <-
        positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")]#
    # add end
    intervals_4[, end := start + 100L]
}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                          expr
#>  {     intervals_4 <- positions[intervals][, .(n_sample = sum(!is.na(sample))),          by = c("chr", "start")]     intervals_4[, `:=`(end, start + 100L)] }
#>       min       lq     mean   median       uq      max neval
#>  419.1001 443.9567 546.0186 478.5985 650.4682 1115.179    50

^{由reprex 包（v0.3.0）于 2019 年 9 月 25 日创建}

r - R：计算每个间隔的唯一元素

2 回答 2

Related

Reference