这肯定很长(考虑到expand.grid.df,在大型data.frames上可能效率很低,但是,我希望它能给你一个起点。作为警告,我没有基因组学背景(我敢肯定通过)所以不知道通用包。当然这些是最好的方法。我只是认为尝试解决方案会很有趣。
s<-"sample chr start end
NE001 1 100 200
NE001 2 100 200
NE002 1 50 150
NE002 2 50 150
NE003 2 250 300"
dat<-read.table(text=s, header=T)
library(plyr)
between<-function(x,y,z) x<=y & y<=z
dat$id<-seq_along(dat[,1])
expand.grid.df <- function(...) Reduce(function(...) merge(..., by=NULL), list(...))
expdat<-ddply(dat, .(chr), function(x) expand.grid.df(x,x))
expdat<-subset(expdat, id.x!=id.y)
expdat$betweenL<-with(expdat, between(start.y, start.x, end.y))
expdat$betweenR<-with(expdat, between(start.x, start.y, end.x))
expdat<-subset(expdat, betweenL | betweenR)
expdat$commonstart<-with(expdat,ifelse(betweenL,start.x,start.y))
expdat$commonend<-with(expdat, ifelse(betweenL, end.y, end.x))
res<-ddply(expdat, .(chr, commonstart, commonend),summarize, freq=length(sample.x))
> res
chr commonstart commonend freq
1 1 100 150 2
2 2 100 150 2