我想我会尝试使用 Biopython 来挽救合作者提供的一些容易损坏的 fastq 文件。@
我只需要修改包含某个子字符串的标题行(以 开头)。但是,以下代码创建的新 fastq 文件并没有改变。毫无疑问,我遗漏了一些明显的东西。
编写修改后的 fastq SeqRecord 的正确方法是什么?
import os, sys
from Bio import SeqIO
path_to_reads = sys.argv[1]
if not os.path.exists(path_to_reads + '/fixed'):
os.mkdir(path_to_reads + '/fixed')
fwd_fastqs = [fn for fn in os.listdir(path_to_reads) if fn.endswith('_F.fastq')]
rev_fastqs = [fn for fn in os.listdir(path_to_reads) if fn.endswith('_R.fastq')]
fastq_pairs = zip(fwd_fastqs, rev_fastqs)
for fastq_pair in fastq_pairs:
with open(path_to_reads + '/' + fastq_pair[0], 'rU') as fwd_fastq:
with open(path_to_reads + '/fixed/' + fastq_pair[0], 'w') as fixed_fwd_fastq:
fixed_fwd_records = []
for fwd_record in SeqIO.parse(fwd_fastq, 'fastq'):
fwd_record.name = fwd_record.name.replace('/2','/1')
fixed_fwd_records.append(fwd_record)
SeqIO.write(fixed_fwd_records, fixed_fwd_fastq, 'fastq')
# ...
输入数据(两条记录,标题行以 开头@
):
@MISEQ01:115:000000000-A8FBM:1:1112:18038:15085/1
GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGCCAATCATCTCGTATGCCGTCTTCTGCTTG
+
AAAAAAAAAF4CGGGGGAGGGFGHBHGHC5AGEGFFHGA3F355FGG223FFAEE0GCGA55BAB
@MISEQ01:115:000000000-A8FBM:1:1101:20590:9966/2
GATCACTCCCCTGTGAGGAACTACTGTCTTCACGCAGAAAGCGTCTAGCCATGGCGTTAGTATGA
+
1>A111DFBA1CFA1FFG1BFGB1D1DGFGH3GECA0ABFFG?E///DDGFBB0FEAEEFBDAB2