我有一些配对末端测序数据。但是,文件未正确配对。需要删除未配对的读取以使读取对文件保持一致。尽管有使用 Trimmomatic 之类的解决方案。我想要关于如何提高我自己的脚本性能的建议。当前版本每秒处理大约 10K 条记录。
import sys
class FastqRecord(object):
def __init__(self, block, unique_col=1, unique_type=int, sep=' '):
self.block = "".join(block[:])
self.header = block[0][1:].rstrip('\n')
self.uid = unique_type(self.header.split(sep)[unique_col])
def __eq__(self, other):
return self.uid == other.uid
def __gt__(self, other):
return self.uid > other.uid
class Fastq(object):
def __init__(self, filename):
self.file = filename
self.handle = open(self.file)
self.count = 0
self.record = self.next()
def __iter__(self):
return self
def next(self):
return FastqRecord([self.handle.next(), self.handle.next(), self.handle.next(), self.handle.next()])
def update(self):
try:
self.record = self.next()
self.count+=1
except StopIteration:
self.record = False
self.handle.close()
def fn_from_path(path):
return path.split('/')[-1].split('.')[0]
def flush_handle(obj, handle):
handle.write(obj.record.block)
for x in obj:
handle.write(x.block)
return True
@profile
def main():
file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
#file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
out_handles = {
'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
}
f1, f2 = Fastq(file1), Fastq(file2)
while True:
print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
sys.stdout.flush()
if f1.record is False or f2.record is False:
if f1.record is False:
flush_handle(f2, out_handles['s'])
else:
flush_handle(f1, out_handles['s'])
break
if f1.record == f2.record:
out_handles['p1'].write(f1.record.block)
out_handles['p2'].write(f2.record.block)
f1.update()
f2.update()
elif f1.record > f2.record:
out_handles['s'].write(f2.record.block)
f2.update()
else:
out_handles['s'].write(f1.record.block)
f1.update()
for i in out_handles:
out_handles[i].close()
print "\n Done!"
if __name__ == "__main__":
main()
这是我的 FASTQ 文件的外观:
head SRR2130002_1.fasta
@SRR2130002.1 1 length=39
CCCTAACCCTAACCCTAACCCTAACCCAAAGACAGGCAA
+SRR2130002.1 1 length=39
AAAAA7F<<FF<F.<FFFFF77F.))))FA.))))))7A
@SRR2130002.2 2 length=39
TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTATCTCGTA
+SRR2130002.2 2 length=39
<AAAAAA.7FFFFFFFFFF.<...).A.F7F7)A.FAA<
@SRR2130002.3 3 length=39
CTACCCCTAACCCTAACCCTAACAAAACCCCAAAACAAA
谢谢
更新:我在 main() 函数上运行了一个 line profiler (kernprof)。并得到以下统计数据:
定时器单位:1e-06 s
总时间:0.808629 s 文件:paired_extractor.py 功能:第 46 行的 main
Line # Hits Time Per Hit % Time Line Contents
==============================================================
46 @profile
47 def main():
48 1 4 4.0 0.0 file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
49 #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
50 1 1 1.0 0.0 out_handles = {
51 1 4830 4830.0 0.6 'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
52 1 4118 4118.0 0.5 'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
53 1 1283 1283.0 0.2 's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
54 }
55 1 2945 2945.0 0.4 f1, f2 = Fastq(file1), Fastq(file2)
56 25001 23354 0.9 2.9 while True:
57 25001 105393 4.2 13.0 print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
58 25001 68264 2.7 8.4 sys.stdout.flush()
59 25001 29858 1.2 3.7 if f1.record is False or f2.record is False:
60 1 1 1.0 0.0 if f1.record is False:
61 1 9578 9578.0 1.2 flush_handle(f2, out_handles['s'])
62 else:
63 flush_handle(f1, out_handles['s'])
64 1 1 1.0 0.0 break
65 25000 47062 1.9 5.8 if f1.record == f2.record:
66 23593 41546 1.8 5.1 out_handles['p1'].write(f1.record.block)
67 23593 34040 1.4 4.2 out_handles['p2'].write(f2.record.block)
68 23593 216319 9.2 26.8 f1.update()
69 23593 196077 8.3 24.2 f2.update()
70 1407 2340 1.7 0.3 elif f1.record > f2.record:
71 out_handles['s'].write(f2.record.block)
72 f2.update()
73 else:
74 1407 2341 1.7 0.3 out_handles['s'].write(f1.record.block)
75 1407 13231 9.4 1.6 f1.update()
76 4 9 2.2 0.0 for i in out_handles:
77 3 6023 2007.7 0.7 out_handles[i].close()
78 1 11 11.0 0.0 print "\n Done!"