python - 使用python在制表符分隔文件中查找字母数字条目

Question

我正在尝试使用带有 Blast2GO 注释的 SIMAP 数据库进行 GO 注释。整体运行正常，但在一个将登录号（accession number）与其 GO 术语关联起来的文件中查找登录号时遇到了问题：脚本找不到输入文件中确实存在的登录号。我尝试了几种办法（re.match、先放入列表再提取元素等），都没有得到好的结果。将 GO 与登录号关联的文件具有以下结构（登录号、GO 术语、Blast2GO 分数）：

1f0ba1d119f52ff28e907d2b5ea450db GO:0007154 79

1f0ba1d119f52ff28e907d2b5ea450db GO:0005605 99

Python 代码：

import re
from Bio.Blast import NCBIXML
from Bio import SeqIO

# Original (non-working) script from the question, Python 2 syntax.
# For every BLAST record that has alignments it writes the query ID and then
# tries to append field 1 of each matching line of the annotation file `fh`;
# afterwards it writes every FASTA ID that produced no alignment, followed by
# 'no_match'.
input_file = open('/home/fpiston/Desktop/test_go/test2.fasta', 'rU')
result_handle = open('/home/fpiston/Desktop/test_go/test2.xml', 'rU')
save_file = open('/home/fpiston/Desktop/test_go/test2.out', 'w')

# Annotation file; each of its lines is split on whitespace in the loop below.
fh = open('/home/fpiston/Desktop/test_go/Os_Bd_Ta_blat2go_fake', 'rU')
q_dict =  SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
blast_records = NCBIXML.parse(result_handle)

# First whitespace-separated token of every query that had alignments.
hits = []

for blast_record in blast_records:
    if blast_record.alignments:
        # NOTE(review): `list` shadows the builtin of the same name.
        list = (blast_record.query).split()
        # NOTE(review): the '|' characters are not escaped, so the pattern is
        # an alternation of 'ENA', '\w*' and '\w*'. '\w*' can match the empty
        # string, so re.match() never returns None here and the else branch
        # is never taken; an ID without '|' makes list2[1] raise IndexError.
        if re.match('ENA|\w*|\w*', list[0]) != None:
            list2 = list[0].split("|")
            save_file.write('%s\t' % list2[1])
        else:
            save_file.write('%s\t' % list[0])
        for alignment in blast_record.alignments:
            # `hsp` itself is unused; the body simply runs once per HSP.
            for hsp in alignment.hsps:
                h = alignment.hit_def    
                # `fh` is never rewound, so only the first execution of this
                # loop reads any lines; later executions see an exhausted file.
                for l in fh:             
                    ls = l.split()       # whitespace-separated fields of the line
                    # List membership: true only when the whole hit_def string
                    # equals one complete field of the line.
                    if h in ls:          # reported by the author as never true
                        print h
                        print 'ok'
                        save_file.write('%s\t' % ls[1])
                # One newline is written per HSP, not per query.
                save_file.write('\n')
        hits.append(blast_record.query.split()[0])
# FASTA record IDs that never appeared as a query with alignments.
misses =set(q_dict.keys()) - set(hits)

for i in misses:
    list = i.split("|")
    if len(list) > 1:
        save_file.write('%s\t' % list[1])
    else:
        # NOTE(review): `list` is a one-element list here, so this writes its
        # repr (e.g. "['id']") rather than the bare ID.
        save_file.write('%s\t' % list)
    save_file.write('%s\n' % 'no_match')

save_file.close()

This is the code with the correction of martineau (fh.seek(0)):

#!/usr/bin/env python
import sys
import re
from Bio.Blast import NCBIXML
from Bio import SeqIO

# Usage: script.py <queries.fasta> <blast_results.xml> <output_file>
#
# For every BLAST query that has alignments, write one output line holding the
# query's short ID, then (once per HSP) the hit definition followed by the GO
# terms of every annotation entry whose accession occurs in that definition.
# FASTA queries that produced no alignment are listed afterwards as 'no_match'.
input_file = sys.argv[1]  # query sequences in FASTA format
out_blast_file = sys.argv[2]  # name of the BLAST results file (XML)
output_file = sys.argv[3]  # name of the output file

# Whitespace-separated annotation table: accession, GO term, score.
GO_ANNOTATION_FILE = '/home/fpiston/Desktop/test_go/Os_Bd_Ta_blat2go'


def _short_id(query_id):
    """Return the second '|'-separated field of *query_id*, or the whole ID.

    The earlier test ``re.match('ENA|\\w*|\\w*', ...)`` left the '|'
    characters unescaped, so it matched every string and then indexed field 1
    unconditionally, raising IndexError for IDs without a '|'.  Splitting and
    checking the field count gives the same result for piped IDs and falls
    back to the full ID otherwise.
    """
    fields = query_id.split("|")
    if len(fields) > 1:
        return fields[1]
    return query_id


def _load_annotations(path):
    """Return a list of (accession, go_term) pairs read from *path*.

    Lines with fewer than two whitespace-separated fields (for example blank
    lines) are skipped instead of raising IndexError.
    """
    pairs = []
    # Plain text mode: the 'U' flag is rejected by Python 3.11+, and
    # str.split() already discards any trailing '\r'.
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            if len(fields) >= 2:
                pairs.append((fields[0], fields[1]))
    return pairs


# Read the annotation table once instead of re-scanning the file for every HSP.
annotations = _load_annotations(GO_ANNOTATION_FILE)

with open(input_file) as fasta_handle:
    q_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

# First whitespace-separated token of every query that had alignments.
hits = set()

with open(out_blast_file) as result_handle, open(output_file, 'w') as save_file:
    for blast_record in NCBIXML.parse(result_handle):
        if not blast_record.alignments:
            continue
        query_id = blast_record.query.split()[0]
        # Each record starts with '\n', so the file begins with an empty line
        # and has no trailing newline (output layout kept unchanged).
        save_file.write('\n%s\t' % _short_id(query_id))
        for alignment in blast_record.alignments:
            hit = alignment.hit_def
            # The hit and its GO terms are written once per HSP, as before.
            for _hsp in alignment.hsps:
                save_file.write('%s\t' % hit)
                for accession, go_term in annotations:
                    # Substring test: the accession only has to occur
                    # somewhere inside the hit definition.
                    if accession in hit:
                        save_file.write('%s\t' % go_term)
        hits.add(query_id)

    # Queries present in the FASTA file but without any alignment; sorted so
    # the output order is reproducible between runs.
    misses = set(q_dict) - hits
    for query_id in sorted(misses):
        save_file.write('\n%s\t' % _short_id(query_id))
        save_file.write('no_match')

score 0 · Accepted Answer

I really have no idea what you're talking about here, but noticed that within the outer for blast_record in blast_records: and for alignment in blast_record.alignments: loops you have a for l in fh: but never rewind the file with a fh.seek(0) anywhere, which means it only reads the lines in the file the first time it's executed -- which seems illogical.

You could fix this by adding the fh.seek(0) just before the inner loop. Although unnecessary the very first time the inner loop executes, it's needed all the following times, and doing it one extra time won't hurt anything.

python - 使用python在制表符分隔文件中查找字母数字条目

1 回答 1

Related

Reference