2

假设我有一个不重叠的基因组区间列表。

chr1    1   100
chr1    101 200
chr1    201 300
chr1    301 400

以及与不同样本相关的基因组位置列表:

chr1    50  sampleA
chr1    60  sampleB
chr1    110 sampleA
chr1    130 sampleB
chr1    160 sampleA
chr1    190 sampleC
chr1    350 sampleB
chr1    360 sampleB

我的目标是计算每个间隔的唯一样本数。在我的真实数据集中,间隔表是 ~400.000 行,基因组位置样本表是 ~30.000 行。

该计算嵌入在模拟中,因此它应该尽可能快。我已经尝试过使用 GenomicRanges 作为:

require(GenomicRanges)
interval.gr <- GRanges(intervals$chr,IRanges(intervals$start,intervals$end))
positions.gr <- GRanges(positions$chr,IRanges(positions$pos,positions$pos))
ov <- findOverlaps(interval.gr,positions.gr)
intervals %>%
  slice(queryHits(ov)) %>%
  mutate(sample=positions$sample[subjectHits(ov)]) %>% 
  group_by(chr,start,end) %>% 
  summarise(n_sample=length(unique(sample)))

结果是

# A tibble: 3 x 4
# Groups:   chr, start [3]
  chr   start   end n_sample
  <fct> <dbl> <dbl>    <int>
1 chr1      1   100        2
2 chr1    101   200        3
3 chr1    301   400        1

然而,它仍然会在没有样本的情况下下降间隔(201-300),而且速度也不是很快。使用我的数据集:

Unit: milliseconds
 expr      min      lq     mean   median       uq      max neval
    x 159.3901 161.621 190.1703 164.4879 168.3116 297.8395    10

我想知道是否有更好更快的方法来进行这种分析?

谢谢


可重现的数据集:

intervals <- data.frame(chr=c("chr1","chr1","chr1","chr1"),start=c(1,101,201,301),end=c(100,200,300,400))

positions <- data.frame(chr=rep("chr1",8),pos=c(50,60,110,130,160,190,350,360),sample=c("sampleA","sampleB","sampleA","sampleB","sampleA","sampleC","sampleB","sampleB"))

编辑

与我的真实数据集大小相同的可重现数据集

intervals <- data.frame(chr=paste0("chr",round(runif(400000,min = 1,max = 22))),start=round(runif(n = 400000,min = 1,max = 100000000)))
intervals$end <- intervals$start+100

positions <- data.frame(chr=paste0("chr",round(runif(30000,min = 1,max = 22))),pos=round(runif(n = 30000,min = 1,max = 100000000)),sample=sample(paste0("sample",1:400),size = 30000,replace=T))
4

2 回答 2

2

基于@Jon 所说,data.table 是解决此问题的好方法。使用函数 foverlaps() 可以大大提高速度。

library(data.table)
intervals <- data.frame(chr=c("chr1","chr1","chr1","chr1"),
                        start=c(1,101,201,301),
                        end=c(100,200,300,400))

positions <- data.frame(chr=rep("chr1",8),
                        pos=c(50,60,110,130,160,190,350,360),
                        sample=c("sampleA","sampleB","sampleA","sampleB","sampleA","sampleC","sampleB","sampleB"))
setDT(positions)
setDT(intervals)

  positions[, pos_tmp := pos]
  setkey(positions,chr, pos, pos_tmp)
  overlap = foverlaps(intervals, positions, type="any",by.x=c("chr","start", "end")) ## return overlap indices
  overlap[!is.na(sample),.(n_sample = .N), by = .(chr, start, end)]

与@Jon 在我的机器上大约需要 6 秒的实现相比,上述实现需要大约 180 毫秒

于 2019-09-25T13:06:11.033 回答
0

对于大样本,我会尝试使用非 equi join in data.table。样本中的基准比 tmfmnk 的答案差,但对于更大的数据集,它可能会占用更小的内存并且速度更快。

library(data.table)
library(microbenchmark)

intervals <-
  data.table(
    chr = c("chr1", "chr1", "chr1", "chr1"),
    start = c(1, 101, 201, 301),
    end = c(100, 200, 300, 400)
  )
positions <-
  data.table(
    chr = rep("chr1", 8),
    pos = c(50, 60, 110, 130, 160, 190, 350, 360),
    sample = c(
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleC",
      "sampleB",
      "sampleB"
    )
  )


# This takes only the ones we need:
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]


# Benchmarking
microbenchmark(positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = sum(uniqueN(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
, times = 50)

让我知道它是否有效。我进行了一些更改,以创建更接近您的用例的示例。还包括 Bryan 的基准测试案例。你的方法仍然是最快的。我对第 4 种方法(只有 2 个键)抱有希望,但它仍然较慢。

library(data.table)
library(microbenchmark)
library(GenomicRanges)
set.seed(14)
intervals <-
    data.table(chr = paste0("chr", round(runif(
        400000, min = 1, max = 22
    ))), start = round(runif(
        n = 400000, min = 1, max = 100000000
    )))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]

intervals$end <- intervals$start + 100L

# erase duplicates
intervals <-
    unique(intervals[, .SD, .SDcols = c("chr", "start", "end")])

positions <-
    data.table(
        chr = paste0("chr", round(runif(
            30000, min = 1, max = 22
        ))),
        pos = round(runif(
            n = 30000, min = 1, max = 100000000
        )),
        sample = sample(paste0("sample", 1:400), size = 30000, replace = T)
    )

setkeyv(intervals, c("chr", "start", "end"))
positions[, pos2 := pos]
setkeyv(positions, c("chr", "pos", "pos2"))


microbenchmark({
    interval.gr <-
        GRanges(intervals$chr, IRanges(intervals$start, intervals$end))
    positions.gr <-
        GRanges(positions$chr, IRanges(positions$pos, positions$pos))
    ov <- findOverlaps(interval.gr, positions.gr)
    res1 <- intervals %>%
        slice(queryHits(ov)) %>%
        mutate(sample = positions$sample[subjectHits(ov)]) %>%
        group_by(chr, start, end) %>%
        summarise(n_sample = length(unique(sample))) %>% data.table(.)

    intervals_1 <-
        rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 expr
#>  {     interval.gr <- GRanges(intervals$chr, IRanges(intervals$start,          intervals$end))     positions.gr <- GRanges(positions$chr, IRanges(positions$pos,          positions$pos))     ov <- findOverlaps(interval.gr, positions.gr)     res1 <- intervals %>% slice(queryHits(ov)) %>% mutate(sample = positions$sample[subjectHits(ov)]) %>%          group_by(chr, start, end) %>% summarise(n_sample = length(unique(sample))) %>%          data.table(.)     intervals_1 <- rbind(res1, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>       min       lq     mean   median       uq      max neval
#>  148.0612 164.4884 225.8665 178.5495 219.6038 585.5995    50


microbenchmark({
    # This takes only the ones we need:
    res2 <-
        positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(
            chr = i.chr,
            start = i.start,
            end = i.end,
            sample = x.sample
        )][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
    intervals_2 <-
        rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]

}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                                                                             expr
#>  {     res2 <- positions[intervals, on = .(chr == chr, pos <= end,          pos >= start), nomatch = 0L, .(chr = i.chr, start = i.start,          end = i.end, sample = x.sample)][, .(n_sample = length(unique(sample,          na.rm = TRUE))), by = c("chr", "start", "end")]     intervals_2 <- rbind(res2, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>       min       lq     mean   median       uq      max neval
#>  294.4081 338.0309 447.0925 383.2813 548.3734 883.4389    50

microbenchmark({
    # Bryan's approach
    overlap = foverlaps(intervals,
                        positions,
                        type = "any",
                        by.x = c("chr", "start", "end")) ## return overlap indices
    res3 <-
        overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)]
    intervals_3 <-
        rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]

}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                                                                                                                                                                                                                 expr
#>  {     overlap = foverlaps(intervals, positions, type = "any", by.x = c("chr",          "start", "end"))     res3 <- overlap[!is.na(sample), .(n_sample = length(unique(sample))),          by = .(chr, start, end)]     intervals_3 <- rbind(res3, intervals, fill = TRUE)[, sum(n_sample,          na.rm = TRUE), by = c("chr", "start", "end")] }
#>      min       lq    mean   median       uq      max neval
#>  221.907 280.8795 429.961 398.0075 475.0276 1017.866    50


## Starting by 1

set.seed(14)
intervals <-
    data.table(chr = paste0("chr", round(runif(
        400000, min = 1, max = 22
    ))), start = round(runif(
        n = 400000, min = 1, max = 100000000
    )))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]


positions[, start := floor(pos / 100) * 100 + 1L]
positions <-
    unique(positions[, .SD, .SDcols = c("chr", "start", "sample")])
setkeyv(intervals, c("chr", "start"))
setkeyv(positions, c("chr", "start"))
microbenchmark({
    intervals_4 <-
        positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")]#
    # add end
    intervals_4[, end := start + 100L]
}, times = 50)
#> Unit: milliseconds
#>                                                                                                                                                          expr
#>  {     intervals_4 <- positions[intervals][, .(n_sample = sum(!is.na(sample))),          by = c("chr", "start")]     intervals_4[, `:=`(end, start + 100L)] }
#>       min       lq     mean   median       uq      max neval
#>  419.1001 443.9567 546.0186 478.5985 650.4682 1115.179    50

reprex 包(v0.3.0)于 2019 年 9 月 25 日创建

于 2019-09-25T11:44:26.093 回答