r - 快速查找线性区间重叠的方法

Question

我有一个data.frame线性区间（映射的 RNA-seq 读取的基因组坐标），例如：

df <- data.frame(seqnames = c(rep("chr10",2),rep("chr5",8)),
                 start = c(12255935,12257004,12243635,12244009,12253879,12254395,12254506,12255142,12255229,12258719),
                 end = c(12257002,12258512,12243764,12244291,12254107,12254501,12254515,12255535,12255312,12258764),
                 read_id = c(rep("R9",2),rep("R10",8)),
                 stringsAsFactors = F)

对于某些读取，在同一读取的其他读取中包含或相交的间隔，我想合并它们。在上面的示例中read_id = "R10"，interval:chr5 12255229 12255312包含在 intervalchr5 12255142 12255535中。

对于单次读取data.frame，我使用以下过程：

#defining helper functions
clusterHits <- function(overlap.hits)
{
  overlap.hits <- GenomicRanges::union(overlap.hits,t(overlap.hits))
  query.hits <- S4Vectors::queryHits(overlap.hits)
  search.hits <- S4Vectors::subjectHits(overlap.hits)
  cluster.ids <- seq_len(S4Vectors::queryLength(overlap.hits))
  while(TRUE){
    hit <- S4Vectors::Hits(query.hits,cluster.ids[search.hits],S4Vectors::queryLength(overlap.hits),S4Vectors::subjectLength(overlap.hits))
    tmp.cluster.ids <- pmin(cluster.ids,S4Vectors::selectHits(hit,"first"))
    if(identical(tmp.cluster.ids,cluster.ids))
      break
    cluster.ids <- tmp.cluster.ids
  }
  unname(S4Vectors::splitAsList(seq_len(S4Vectors::queryLength(overlap.hits)),cluster.ids))
}

mergeConnectedRanges <- function(x.gr,overlap.hits)
{
  cluster.ids <- clusterHits(overlap.hits)
  merged.gr <- range(IRanges::extractList(x.gr,cluster.ids))
  merged.gr <- unlist(merged.gr)
  S4Vectors::mcols(merged.gr)$merged.idx <- cluster.ids
  return(merged.gr)
}

#Now separate R10 and merge its intervals
df1 <- dplyr::filter(df, read_id == "R10")
gr <- GenomicRanges::GRanges(dplyr::select(df1,seqnames,start,end))
redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals))

这使：

  seqnames    start      end width strand merged.idx
1     chr5 12243635 12243764   130      *          1
2     chr5 12244009 12244291   283      *          2
3     chr5 12253879 12254107   229      *          3
4     chr5 12254395 12254501   107      *          4
5     chr5 12254506 12254515    10      *          5
6     chr5 12255142 12255535   394      *       6, 7
7     chr5 12258719 12258764    46      *          8

因此merged.idx显示区间 6 和 7 indf1已合并。

我正在寻找一种在数千次读取中快速执行此操作的方法。显而易见的方法是do.call在唯一的读取中使用df：

library(dplyr)
do.call(rbind, lapply(unique(df$read_id), function(r){
  read.df <- dplyr::filter(df, read_id == r)
  gr <- GenomicRanges::GRanges(dplyr::select(read.df,seqnames,start,end))
  redundant.intervals <- GenomicRanges::findOverlaps(gr,ignore.strand=T)
  query.gr <- redundant.intervals[S4Vectors::queryHits(redundant.intervals)]
  subject.gr <- redundant.intervals[S4Vectors::subjectHits(redundant.intervals)]
  as.data.frame(mergeConnectedRanges(x.gr=gr,overlap.hits=redundant.intervals)) %>%
    dplyr::mutate(read_id = r)
}))

但我想知道是否有更快的方法。请注意，实际上具有这种相交间隔的读取比例相对较小。

score 1 · Accepted Answer

使用GenomicRanges来自Bioconductor存储库的包，该任务可以通过几行代码完成：

library(GenomicRanges)
makeGRangesListFromDataFrame(df, split.field = "read_id") |>
  reduce(with.revmap = TRUE) |>
  as.data.frame()

  group group_name seqnames    start      end width strand revmap
1     1        R10     chr5 12243635 12243764   130      *      1
2     1        R10     chr5 12244009 12244291   283      *      2
3     1        R10     chr5 12253879 12254107   229      *      3
4     1        R10     chr5 12254395 12254501   107      *      4
5     1        R10     chr5 12254506 12254515    10      *      5
6     1        R10     chr5 12255142 12255535   394      *   6, 7
7     1        R10     chr5 12258719 12258764    46      *      8
8     2         R9    chr10 12255935 12257002  1068      *      1
9     2         R9    chr10 12257004 12258512  1509      *      2

由于该GenomeRanges软件包不在 CRAN 上，请参阅小插图安装和管理 Bioconductor 软件包或运行

install.packages("BiocManager")
BiocManager::install("GenomicRanges")

数据

df <- data.frame(seqnames = c(rep("chr10", 2), rep("chr5", 8)),
                 start = c(12255935, 12257004, 12243635, 12244009, 12253879, 12254395, 12254506, 12255142, 12255229, 12258719),
                 end   = c(12257002, 12258512, 12243764, 12244291, 12254107, 12254501, 12254515, 12255535, 12255312, 12258764),
                 read_id = c(rep("R9", 2), rep("R10", 8)), 
                 stringsAsFactors = FALSE)

作为

r - 快速查找线性区间重叠的方法

1 回答 1

数据

Related

Reference