python - 提高 python 脚本的性能以过滤一对 FASTQ 文件中的未配对读取

Question

我有一些配对末端测序数据。但是，文件未正确配对。需要删除未配对的读取以使读取对文件保持一致。尽管有使用 Trimmomatic 之类的解决方案。我想要关于如何提高我自己的脚本性能的建议。当前版本每秒处理大约 10K 条记录。

import sys


class FastqRecord(object):
    def __init__(self, block, unique_col=1, unique_type=int, sep=' '):
        self.block = "".join(block[:])
        self.header = block[0][1:].rstrip('\n')
        self.uid = unique_type(self.header.split(sep)[unique_col])

    def __eq__(self, other):
        return self.uid == other.uid

    def __gt__(self, other):
        return self.uid > other.uid

class Fastq(object):
    def __init__(self, filename):
        self.file = filename
        self.handle = open(self.file)
        self.count = 0
        self.record = self.next()

    def __iter__(self):
        return self

    def next(self):
        return FastqRecord([self.handle.next(), self.handle.next(), self.handle.next(), self.handle.next()])

    def update(self):
        try:
            self.record = self.next()
            self.count+=1
        except StopIteration:
            self.record = False
            self.handle.close()

def fn_from_path(path):
    return path.split('/')[-1].split('.')[0]

def flush_handle(obj, handle):
    handle.write(obj.record.block)
    for x in obj:
        handle.write(x.block)
    return True

@profile
def main():
    file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
    #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
    out_handles = {
        'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
        'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
        's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
    }
    f1, f2 = Fastq(file1), Fastq(file2)
    while True:
        print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
        sys.stdout.flush()
        if f1.record is False or f2.record is False:
            if f1.record is False:
                flush_handle(f2, out_handles['s'])
            else:
                flush_handle(f1, out_handles['s'])
            break
        if f1.record == f2.record:
            out_handles['p1'].write(f1.record.block)
            out_handles['p2'].write(f2.record.block)
            f1.update()
            f2.update()
        elif f1.record > f2.record:
            out_handles['s'].write(f2.record.block)
            f2.update()
        else:
            out_handles['s'].write(f1.record.block)
            f1.update()
    for i in out_handles:
        out_handles[i].close()
    print "\n Done!"

if __name__ == "__main__":
    main()

这是我的 FASTQ 文件的外观：

head SRR2130002_1.fasta

@SRR2130002.1 1 length=39
CCCTAACCCTAACCCTAACCCTAACCCAAAGACAGGCAA
+SRR2130002.1 1 length=39
AAAAA7F<<FF<F.<FFFFF77F.))))FA.))))))7A
@SRR2130002.2 2 length=39
TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTATCTCGTA
+SRR2130002.2 2 length=39
<AAAAAA.7FFFFFFFFFF.<...).A.F7F7)A.FAA<
@SRR2130002.3 3 length=39
CTACCCCTAACCCTAACCCTAACAAAACCCCAAAACAAA

谢谢

更新：我在 main() 函数上运行了一个 line profiler (kernprof)。并得到以下统计数据：

定时器单位：1e-06 s

总时间：0.808629 s 文件：paired_extractor.py 功能：第 46 行的 main

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    46                                           @profile
    47                                           def main():
    48         1            4      4.0      0.0      file1, file2, out_dir = (sys.argv[1], sys.argv[2], sys.argv[3].rstrip('/'))
    49                                               #file1, file2, out_dir = ('./test/SRR2130002_1.fastq', './test/SRR2130002_2.fastq', '.')
    50         1            1      1.0      0.0      out_handles = {
    51         1         4830   4830.0      0.6          'p1': open('%s/%s.fastq' % (out_dir, fn_from_path(file1)), 'w'),
    52         1         4118   4118.0      0.5          'p2': open('%s/%s.fastq' % (out_dir, fn_from_path(file2)), 'w'),
    53         1         1283   1283.0      0.2          's': open('%s/%s_unpaired.fastq' % (out_dir, fn_from_path(file1).split('_')[0]), 'w')
    54                                               }
    55         1         2945   2945.0      0.4      f1, f2 = Fastq(file1), Fastq(file2)
    56     25001        23354      0.9      2.9      while True:
    57     25001       105393      4.2     13.0          print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
    58     25001        68264      2.7      8.4          sys.stdout.flush()
    59     25001        29858      1.2      3.7          if f1.record is False or f2.record is False:
    60         1            1      1.0      0.0              if f1.record is False:
    61         1         9578   9578.0      1.2                  flush_handle(f2, out_handles['s'])
    62                                                       else:
    63                                                           flush_handle(f1, out_handles['s'])
    64         1            1      1.0      0.0              break
    65     25000        47062      1.9      5.8          if f1.record == f2.record:
    66     23593        41546      1.8      5.1              out_handles['p1'].write(f1.record.block)
    67     23593        34040      1.4      4.2              out_handles['p2'].write(f2.record.block)
    68     23593       216319      9.2     26.8              f1.update()
    69     23593       196077      8.3     24.2              f2.update()
    70      1407         2340      1.7      0.3          elif f1.record > f2.record:
    71                                                       out_handles['s'].write(f2.record.block)
    72                                                       f2.update()
    73                                                   else:
    74      1407         2341      1.7      0.3              out_handles['s'].write(f1.record.block)
    75      1407        13231      9.4      1.6              f1.update()
    76         4            9      2.2      0.0      for i in out_handles:
    77         3         6023   2007.7      0.7          out_handles[i].close()
    78         1           11     11.0      0.0      print "\n Done!"

score 0 · Accepted Answer

让我提醒我的回应是，如果不使用分析器进行尽职调查，通常很难/不可能优化代码（这是我即将做的，因为我没有你的 fasta 文件）。你似乎也有一些使用字符串的好习惯（比如使用join）。看起来你不需要

print "\r%9d %9d %9d" % (f1.count, f2.count, f1.count-f2.count),
sys.stdout.flush()

因为这只是将统计信息输出到控制台。通过删除它，你会得到一点加速......但没有什么重大的事情发生在我身上。

就个人观点而言，如果您想要一般的处理速度，我建议使用 C/C++。他们没有解释器的开销。Python 解释器是一个了不起的工具，但它并不是为了速度而构建的。它是为正确性/安全性/易用性而构建的。特别是对于字符串操作，Perl 是我所听到的，尽管我承认我一生中只写了 10 行 Perl。

python - 提高 python 脚本的性能以过滤一对 FASTQ 文件中的未配对读取

1 回答 1

Related

Reference