python - 我如何决定在 Smith-Waterman 算法中回溯的方式？

Question

我正在尝试使用Smith–Waterman 算法在 Python 中实现局部序列比对。

这是我到目前为止所拥有的。它可以建立相似度矩阵：

import sys, string
from numpy import *

f1=open(sys.argv[1], 'r')
seq1=f1.readline()
f1.close()
seq1=string.strip(seq1)

f2=open(sys.argv[2], 'r')
seq2=f2.readline()
f2.close()
seq2=string.strip(seq2)

a,b =len(seq1),len(seq2)

penalty=-1;
point=2;

#generation of matrix for local alignment
p=zeros((a+1,b+1))

# table calculation and matrix generation
for i in range(1,a+1):
    for j in range(1,b+1):
        vertical_score =p[i-1][j]+penalty;
        horizontal_score= p[i][j-1]+penalty;
        if seq1[i-1]==seq2[j-1]:
            diagonal_score =p[i-1][j-1]+point;
        else:
            diagonal_score = p[i-1][j-1]+penalty;
        p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);

print p

例如，使用两个序列：

agcacact
acacacta

我的代码输出相似度矩阵：

[[  0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   2.   1.   2.   1.   2.   1.   0.   2.]
 [  0.   1.   1.   1.   1.   1.   1.   0.   1.]
 [  0.   0.   3.   2.   3.   2.   3.   2.   1.]
 [  0.   2.   2.   5.   4.   5.   4.   3.   4.]
 [  0.   1.   4.   4.   7.   6.   7.   6.   5.]
 [  0.   2.   3.   6.   6.   9.   8.   7.   8.]
 [  0.   1.   4.   5.   8.   8.  11.  10.   9.]
 [  0.   0.   3.   4.   7.   7.  10.  13.  12.]]

现在我被困在算法的下一步，即回溯以构建最佳对齐方式。

维基百科说，

为了获得最佳的局部对齐，我们从矩阵中的最大值 ( i , j ) 开始。然后，根据用于构造矩阵的移动方向，我们返回到位置 ( i -1, j )、( i , j -1) 和 ( i -1, j -1) 之一。我们保持这个过程，直到我们到达一个零值的矩阵单元，或者位置 (0, 0) 的值。

我无法确定要回溯到哪个位置。Wikipedia 所说的“取决于用于构造矩阵的运动方向”是什么意思，我将如何在 Python 中实现这一点？

最后我做到了

import sys, string
from numpy import*
import re

# read first sequence

fasta_sequence1=open(sys.argv[1], 'r')

seq1=""
for i in fasta_sequence1:
    if i.startswith(">"):
        pass
    else:
        seq1 = seq1 + i.strip()   
fasta_sequence1.close()

fasta_sequence2=open(sys.argv[2], 'r')

seq2 = ""
for i in fasta_sequence2:
    if i.startswith('>'):
        pass
    else:
        seq2 = seq2+ i.strip()
fasta_sequence2.close()

a,b =len(seq1),len(seq2)


penalty=-1;
point=2;

#generation of matrix for local alignment 

p=zeros((a+1,b+1))

#intialization of max score
max_score=0;
#pointer to store the traceback path

pointer=zeros((a+1,b+1))

# table calculation and matrix generation
for i in range(1,a+1):
    for j in range(1,b+1):
        vertical_score =p[i-1][j]+penalty;
        horizontal_score= p[i][j-1]+penalty;
        if seq1[i-1]==seq2[j-1]:
            diagonal_score =p[i-1][j-1]+point;
        else:
            diagonal_score = p[i-1][j-1]+penalty;

for i in range(1,a+1):
    for j in range(1,b+1):            
        p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);

for i in range(1,a+1):
    for j in range(1,b+1):
        if p[i][j]==0:
            pointer[i][j]=0; #0 means end of the path
        if p[i][j]==vertical_score:
            pointer[i][j]=1; #1 means trace up
        if p[i][j]==horizontal_score:
            pointer[i][j]=2; #2 means trace left
        if p[i][j]==diagonal_score:
            pointer[i][j]=3; #3 means trace diagonal
        if p[i][j]>=max_score:
            maximum_i=i;
            maximum_j=j;
            max_score=p[i][j];


#for i in range(1,a+1):
   # for j in range(1,b+1):
        #if p[i][j]>max_score:
            #max_score=p[i][j]

print max_score

# traceback of all the pointers 

align1,align2='',''; #initial sequences

i,j=max_i,max_j; #indices of path starting point

while pointer[i][j]!=0:
  if pointer[i][j]==3:
    align1=align1+seq1[i-1];
    align2=align2+seq2[j-1];
    i=i-1;
    j=j-1;
  elif pointer[i][j]==2:
    align1=align1+'-';
    align2=align2+seq2[j-1]
    j=j-1;
  elif pointer[i][j]==1:
    align1=align1+seq1[i-1];
    align2=align2+'-';
    i=i-1;



align1=align1[::-1]; #reverse sequence 1
align2=align2[::-1]; #reverse sequence 2

#output_file = open(sys.argv[3],"w")
#output_file.write(align1)

#output_file.write(align2)

print align1
print align2

但我认为可能有更好、更有效的方法来做到这一点

output_file = open(sys.argv[3],"w")
output_file.write(align1)
output_file.write(align2)

结果显示像

agcacacta-cacact

相反c：

print align1
print align2

显示正确的输出：

agcacact
a-cacact

如何在文件编写器中获得上述输出。谢谢

score 12 · Accepted Answer

当您构建相似度矩阵时，您不仅需要存储相似度得分，还需要存储该得分的来源。您当前有一行代码：

p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);

所以在这里你需要记住的不仅仅是最高分，而是其中哪一个是最高分。然后当你来做回溯时，你就会知道该往哪个方向走。

例如，您可以尝试这样的事情：

import numpy

DELETION, INSERTION, MATCH = range(3)

def smith_waterman(seq1, seq2, insertion_penalty = -1, deletion_penalty = -1,
                   mismatch_penalty = -1, match_score = 2):
    """
    Find the optimum local sequence alignment for the sequences `seq1`
    and `seq2` using the Smith-Waterman algorithm. Optional keyword
    arguments give the gap-scoring scheme:

    `insertion_penalty` penalty for an insertion (default: -1)
    `deletion_penalty`  penalty for a deletion (default: -1)
    `mismatch_penalty`  penalty for a mismatch (default: -1)
    `match_score`       score for a match (default: 2)

    See <http://en.wikipedia.org/wiki/Smith-Waterman_algorithm>.

    >>> for s in smith_waterman('AGCAGACT', 'ACACACTA'): print s
    ... 
    AGCAGACT-
    A-CACACTA
    """
    m, n = len(seq1), len(seq2)

    # Construct the similarity matrix in p[i][j], and remember how
    # we constructed it -- insertion, deletion or (mis)match -- in
    # q[i][j].
    p = numpy.zeros((m + 1, n + 1))
    q = numpy.zeros((m + 1, n + 1))
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            deletion = (p[i - 1][j] + deletion_penalty, DELETION)
            insertion = (p[i][j - 1] + insertion_penalty, INSERTION)
            if seq1[i - 1] == seq2[j - 1]:
                match = (p[i - 1][j - 1] + match_score, MATCH)
            else:
                match = (p[i - 1][j - 1] + mismatch_penalty, MATCH)
            p[i][j], q[i][j] = max((0, 0), deletion, insertion, match)

    # Yield the aligned sequences one character at a time in reverse
    # order.
    def backtrack():
        i, j = m, n
        while i > 0 or j > 0:
            assert i >= 0 and j >= 0
            if q[i][j] == MATCH:
                i -= 1
                j -= 1
                yield seq1[i], seq2[j]
            elif q[i][j] == INSERTION:
                j -= 1
                yield '-', seq2[j]
            elif q[i][j] == DELETION:
                i -= 1
                yield seq1[i], '-'
            else:
                assert(False)

    return [''.join(reversed(s)) for s in zip(*backtrack())]

python - 我如何决定在 Smith-Waterman 算法中回溯的方式？

1 回答 1

Related

Reference