1

我有两个数据集:seq1 和 seq2(DNA 序列)。我想绘制一张点图来比较这两个序列,并在两个序列匹配的位置放置一个点。我可以使用 seqinr 的 dotPlot 来完成此操作,但无法在坐标轴上列出序列字母,因此看不出哪些位点是匹配的。本质上,我想用序列字母替换坐标轴上的数字。

有没有办法做到这一点?也许通过ggplot2?

点图

这些是我的序列:

# The two DNA sequences to compare, stored as vectors of single characters.
seq1 <- strsplit("GCTAGTCAGATCTGACGCTA", split = "")[[1]]
seq2 <- strsplit("GATGGTCACATCTGCCGC", split = "")[[1]]

这就是我生成此图的方式:

# Draw the seqinr dot plot for the two sequences. The title string contains
# a literal line break followed by an escaped "\n", exactly as written.
dotPlot(
  seq1, seq2,
  wsize = 4, wstep = 1, nmatch = 3,
  main = "Dot plot of 2 different sequences
\nwsize = 4, wstep = 1, nmatch = 3"
)
4

2 回答 2

1

dotplot 函数的修改版本(又名“快速而肮脏的 hack”)怎么样?首先,将以下代码复制并粘贴到 R 中:

#' Dot plot of two sequences with the sequence letters on the axes
#'
#' A modification of `dotPlot()` from package seqinr. Both sequences are cut
#' into windows of `wsize` characters taken every `wstep` positions; two
#' windows match when at least `nmatch` of their positions agree. The window
#' matches are then expanded into a full `length(seq1)` x `length(seq2)`
#' matrix in which every position covered by a matching window pair is
#' marked along the diagonal of that pair.
#'
#' Only base R is used here (the seqinr helpers `c2s`/`s2c` are replaced by
#' `paste()`/`strsplit()`), so seqinr does not have to be installed.
#'
#' @param seq1,seq2 Sequences as character vectors of single characters.
#' @param wsize Window size (>= 1).
#' @param wstep Step between consecutive window starts (>= 1).
#' @param nmatch Minimum number of agreeing positions for two windows to
#'   match (between 1 and `wsize`).
#' @param cols Two colours used for non-matching and matching cells.
#' @param xlab,ylab Axis labels; default to the expressions passed as
#'   `seq1` and `seq2`.
#' @param type `1` plots the window-level matrix (as the original dotPlot
#'   does), `2` plots the expanded per-position matrix with the sequence
#'   letters as axis tick labels. Any other value draws nothing.
#' @param ... Further arguments forwarded to `image()`.
#' @return A list with `type1` (window-by-window logical matrix, dimnames
#'   are the window strings) and `type2` (position-by-position logical
#'   matrix, dimnames are the sequence letters).
dotplot <- function(seq1, seq2, wsize = 1, wstep = 1, nmatch = 1,
                    cols = c("white", "black"),
                    xlab = deparse(substitute(seq1)),
                    ylab = deparse(substitute(seq2)), type = 2,
                    ...) {

  cat("This is a modification of the function dotPlot from package seqinr.\n")

  if (nchar(seq1[1]) > 1) {
    stop("seq1 should be provided as a vector of single chars")
  }
  if (nchar(seq2[1]) > 1) {
    stop("seq2 should be provided as a vector of single chars")
  }
  if (wsize < 1) {
    stop("non allowed value for wsize")
  }
  if (wstep < 1) {
    stop("non allowed value for wstep")
  }
  if (nmatch < 1) {
    stop("non allowed value for nmatch")
  }
  if (nmatch > wsize) {
    stop("nmatch > wsize is not allowed")
  }
  # Without this guard seq() below fails with an obscure "wrong sign in 'by'"
  # error when a sequence is shorter than one window.
  if (length(seq1) < wsize || length(seq2) < wsize) {
    stop("wsize is larger than the length of seq1 or seq2")
  }

  # Collapse each window of `wsize` characters into one string.
  make_windows <- function(chars) {
    starts <- seq(from = 1, to = length(chars) - wsize + 1, by = wstep)
    vapply(
      starts,
      function(i) paste(chars[i:(i + wsize - 1)], collapse = ""),
      character(1)
    )
  }
  wseq1 <- make_windows(seq1)
  wseq2 <- make_windows(seq2)

  if (nmatch == wsize) {
    # Exact window equality is enough when every position has to agree.
    xy <- outer(wseq1, wseq2, "==")
  } else {
    # Split the window strings back into columns of characters and count
    # the agreeing positions per window pair. wsize >= 2 in this branch.
    split_windows <- function(w) {
      vapply(
        w,
        function(s) strsplit(s, split = "")[[1]],
        character(wsize),
        USE.NAMES = FALSE
      )
    }
    enough_matches <- function(x, y) {
      colSums(split_windows(x) == split_windows(y)) >= nmatch
    }
    xy <- outer(wseq1, wseq2, enough_matches)
  }

  if (type == 1) {
    # Was `col = col`, which picked up the base function `col` instead of
    # the `cols` argument and made this branch fail.
    image(
      x = seq(from = 1, to = length(seq1), length.out = length(wseq1)),
      y = seq(from = 1, to = length(seq2), length.out = length(wseq2)),
      z = xy, col = cols, xlab = xlab, ylab = ylab, axes = FALSE, ...
    )
    box()
  }

  dimnames(xy) <- list(wseq1, wseq2)

  xy2 <- matrix(
    FALSE,
    nrow = length(seq1), ncol = length(seq2),
    dimnames = list(seq1, seq2)
  )
  hits <- which(xy, arr.ind = TRUE, useNames = FALSE)
  if (nrow(hits) > 0) {
    # Window k starts at sequence position (k - 1) * wstep + 1; each hit
    # marks the `wsize` diagonal cells that the two windows cover.
    offsets <- seq_len(wsize) - 1
    row_pos <- rep((hits[, 1] - 1) * wstep + 1, each = wsize) + offsets
    col_pos <- rep((hits[, 2] - 1) * wstep + 1, each = wsize) + offsets
    xy2[cbind(row_pos, col_pos)] <- TRUE
  }

  if (type == 2) {
    image(
      x = seq_along(seq1), y = seq_along(seq2), z = xy2, col = cols,
      xlab = xlab, ylab = ylab, axes = FALSE, ...
    )
    box()
    axis(side = 1, at = seq_along(seq1), labels = seq1)
    axis(side = 2, at = seq_along(seq2), labels = seq2, las = 1)
  }

  list(type1 = xy, type2 = xy2)
}

然后运行您提供的示例:

# Example input: the two DNA sequences as vectors of single characters.
seq1 <- strsplit("GCTAGTCAGATCTGACGCTA", split = "")[[1]]
seq2 <- strsplit("GATGGTCACATCTGCCGC", split = "")[[1]]
# Plot the comparison and keep the returned match matrices.
xy <- dotplot(seq1, seq2, wsize = 4, wstep = 1, nmatch = 3)

应该产生以下情节:

在此处输入图像描述

简而言之,我稍微扩展了该函数,使其在两个序列之间生成一个完整的矩阵。如果您检查输出对象 xy,您将看到原始的 (type1) 矩阵和扩展后的 (type2) 矩阵。对于非常长的序列,这种修改效率不高,更不用说轴上的核苷酸/氨基酸标记会相互重叠。您可以使用新参数 type 在 type1 和 type2 两种绘图类型之间切换。

于 2014-11-10T12:57:57.873 回答
0

作为比较两个序列的替代方法,您可以考虑 R 包 TraMineR 的 seqalign 函数。

这是一个例子:

library(TraMineR)

seq1 <- strsplit("GCTAGTCAGATCTGACGCTA", split = "")[[1]]
## seq2 is padded with "*" so that both sequences have the same length
seq2m <- strsplit("GATGGTCACATCTGCCGC**", split = "")[[1]]

## Define the sequence object, interpreting "*" as missing states
seq <- seqdef(rbind(seq1, seq2m), missing = "*")

## Every substitution costs 2; the diagonal is set to 0
cost <- matrix(2, nrow = 4, ncol = 4)
diag(cost) <- 0
cost

## Compare sequences 1 and 2
sa <- seqalign(seq, 1:2, indel = 1, sm = cost)
print(sa)
plot(sa)

两个序列在编辑操作方面的比较

该图显示了将一个序列转换为另一个序列的匹配元素 (EQU) 和最小编辑操作(SUB 和 IND)。

于 2014-12-01T17:06:39.373 回答