2

我在一个文件中有 15 个 fasta 格式的蛋白质序列。我必须对它们进行全局和局部的成对对齐,然后生成一个 15x15 的距离得分矩阵来构建树状图。

但是当我这样做时,即一个序列不与自身对齐,我得到 NA 结果。此外,AxB 给出 12131 分数,但 BxA 给出 NA。因此R不能构建系统发育树。

我应该怎么办?

我将此脚本用于循环,但它仅以一种方式读取:

for (i in 1:150) { 
  globalpwa<-pairwiseAlignment(toString(ProtDF[D[1,i],2]) 
                              ,toString(ProtDF[D[2,i],2]),
                              substitutionMatrix = "BLOSUM62",
                              gapOpening = 0,
                              gapExtension = -2,
                              scoreOnly=FALSE,
                              type="global")
  ScoreX[i]<-c(globalpwa@score)   
  nameSeq1[i]<-c(as.character(ProtDF[D[1,i],1]))
  nameSeq2[i]<-c(as.character(ProtDF[D[2,i],1]))
}
4

2 回答 2

2

我使用了一个示例 fasta 文件,即真菌中 RPS29 的蛋白质序列。

ProtDF <-
c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD", 
XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF", 
TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR", 
TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL", 
TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK", 
TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR", 
TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF", 
TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL", 
`sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR", 
`sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD", 
`sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR", 
TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR", 
`sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR", 
`sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR", 
XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR"
)

所以你正确地使用了combn。正如你所说,你需要一个树状图的距离得分矩阵,所以最好将值存储在矩阵中。见下文,我只是简单地以 fasta 的名称命名矩阵,并插入成对值

library(Biostrings)
# you can also read in your file
# like ProtDF = readAAStringSet("fasta")

ProtDF=AAStringSet(ProtDF)

# combination like you want
# here we just use the names
D = combn(names(ProtDF),2)

#make the pairwise matrix
mat = matrix(NA,ncol=length(ProtDF),nrow=length(ProtDF))
rownames(mat) = names(ProtDF)
colnames(mat) = names(ProtDF)

# loop through D

for(idx in 1:ncol(D)){
       i <- D[1,idx]
       j <- D[2,idx]
       globalpwa<-pairwiseAlignment(ProtDF[i], 
                                    ProtDF[j],
                              substitutionMatrix = "BLOSUM62",
                              gapOpening = 0,
                              gapExtension = -2,
                              scoreOnly=FALSE,
                              type="global")
       mat[i,j]<-globalpwa@score
       mat[j,i]<-globalpwa@score
}

# if you need to make diagonal zero
diag(mat) <- 0

# plot
plot(hclust(as.dist(mat)))

在此处输入图像描述

于 2019-11-14T22:40:39.793 回答
1

另一种方法,如果您有兴趣,使用与上面相同的示例:

library(DECIPHER)

ProtDF <- c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD", 
            XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF", 
            TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR", 
            TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL", 
            TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK", 
            TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR", 
            TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF", 
            TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL", 
            `sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR", 
            `sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD", 
            `sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR", 
            TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR", 
            `sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR", 
            `sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR", 
            XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR")

# All pairwise alignments:

# Convert characters to an AA String Set
ProtDF <- AAStringSet(ProtDF)

# Initialize some outputs
AliMat <- matrix(data = list(),
                 ncol = length(ProtDF),
                 nrow = length(ProtDF))

DistMat <- matrix(data = 0,
                  ncol = length(ProtDF),
                  nrow = length(ProtDF))

# loop through the upper triangle of your matrix
for (m1 in seq_len(length(ProtDF) - 1L)) {
  for (m2 in (m1 + 1L):length(ProtDF)) {
    # Align each pair
    AliMat[[m1, m2]] <- AlignSeqs(myXStringSet = ProtDF[c(m1, m2)],
                                  verbose = FALSE)

    # Assign a distance to each alignment, fill both triangles of the matrix
    DistMat[m1, m2] <- DistMat[m2, m1] <- DistanceMatrix(myXStringSet = AliMat[[m1, m2]],
                                                         type = "dist", # return a single value
                                                         includeTerminalGaps = TRUE, # return a global distance
                                                         verbose = FALSE)
  }
}

dimnames(DistMat) <- list(names(ProtDF),
                          names(ProtDF))

Dend01 <- IdClusters(myDistMatrix = DistMat,
                     method = "NJ",
                     type = "dendrogram",
                     showPlot = FALSE,
                     verbose = FALSE)

# A single multiple alignment:

AllAli <- AlignSeqs(myXStringSet = ProtDF,
                    verbose = FALSE)

AllDist <- DistanceMatrix(myXStringSet = AllAli,
                          type = "matrix",
                          verbose = FALSE,
                          includeTerminalGaps = TRUE)

Dend02 <- IdClusters(myDistMatrix = AllDist,
                     method = "NJ",
                     type = "dendrogram",
                     showPlot = FALSE,
                     verbose = FALSE)

Dend01,来自所有成对对齐:

来自成对比对的树状图

Dend02,来自单个多重对齐:

来自单一多重对齐的树状图

最后,DECIPHER 有一个功能可以在浏览器中加载对齐来查看它,如果您的对齐很大,可能会有点错误,但在这种情况下(并且在这种情况下最多有几百个短序列)就好了:

BrowseSeqs(AllAli)

多重对齐

关于 BrowseSeqs 的附注,由于某种原因,它在 Safari 上的表现不佳,但在 Chrome 上表现得很好。序列按照它们在对齐的字符串集中存在的顺序显示。

编辑:BrowseSeqs 确实可以直接与 safari 配合使用,但它确实存在一个奇怪的问题,即与与 RMarkdown 一起编织的 HTML 合并。很奇怪,但在这里不适用。

另一个重要的方面:我使用的所有函数都有一个processors参数,默认情况下设置为 1,如果你想对你的核心变得贪婪,你可以将它设置为 NULL,它只会抓住所有可用的东西。如果你正在对齐非常大的字符串集,这可能非常有用,如果你正在做像这个例子这样微不足道的小事情,不是那么多。

combn 是一个很棒的功能,我一直在使用它。然而对于这些非常简单的设置,我喜欢循环通过上三角形,但这只是个人喜好。

于 2019-11-20T15:08:08.703 回答