我需要使用 OQ 标记将读取质量作为附加字段添加到使用 pysam 的 bam 文件中。
其他使用 samtools 等的传统方式会消耗更多时间并创建多个文件。
我尝试使用下面给出的脚本,但最终是无符号字符而不是质量得分字符串!任何帮助表示赞赏。
提前致谢。
例如。输入 bam:
E00577:205:HVF37CCXY:3:2224:32461:53258 99 chr1 10419997 60 151M = 10420034 188 GGCAGTGGCTTCCGCGTGCCCCGTGTGCTGGTGCGGTTCCCATCACGCAGACAGGAAGGGTGTTTGCGCACTCTGATCAACTGGAACCTCTGTATCANGCGGCTGAATTCCCTTTTTCCTTNACTCNATAAAAGCTACATCAGACTGATGN AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJ<F<J-FJJJJJJJJJJJJJJFJJF<JJJFJ<JJJJJJ<JJFJJJJJJJJJJJJJJJJJJJFJJJJ#JJJJJAJJFJJJJJJJJJJJAFJ#FJJJ#JJJJJJJJFJJAJFAF<JJJAJF# MD:Z:97T23T4A23C0 PG:Z:MarkDuplicates RG:Z:HVF37CCXY.3 NM:i:4 AS:i:144 XS:i:107
预期输出 bam:
E00577:205:HVF37CCXY:3:2224:32461:53258 99 chr1 10419997 60 151M = 10420034 188 GGCAGTGGCTTCCGCGTGCCCCGTGTGCTGGTGCGGTTCCCATCACGCAGACAGGAAGGGTGTTTGCGCACTCTGATCAACTGGAACCTCTGTATCANGCGGCTGAATTCCCTTTTTCCTTNACTCNATAAAAGCTACATCAGACTGATGN AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJ<F<J-FJJJJJJJJJJJJJJFJJF<JJJFJ<JJJJJJ<JJFJJJJJJJJJJJJJJJJJJJFJJJJ#JJJJJAJJFJJJJJJJJJJJAFJ#FJJJ#JJJJJJJJFJJAJFAF<JJJAJF# MD:Z:97T23T4A23C0 PG:Z:MarkDuplicates RG:Z:HVF37CCXY.3 NM:i:4 AS:i:144 XS:i:107 OQ:Z:AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJ<F<J-FJJJJJJJJJJJJJJFJJF<JJJFJ<JJJJJJ<JJFJJJJJJJJJJJJJJJJJJJFJJJJ#JJJJJAJJFJJJJJJJJJJJAFJ#FJJJ#JJJJJJJJFJJAJFAF<JJJAJF#
皮萨姆
import os
import argparse
import pysam
parser = argparse.ArgumentParser(description = 'Generate BAM with OQ tag')
parser.add_argument('-i', '--input', required=True, help='Input mark dup BAM File')
parser.add_argument('-o', '--output', required=True, help='Output BAM file with OQ tag')
args = parser.parse_args()
infile_path = os.path.abspath(args.input)
outfile_path = os.path.abspath(args.output)
infile = pysam.AlignmentFile(infile_path, "rb")
outfile = pysam.AlignmentFile(outfile_path, "wb", template=infile)
iter = infile.fetch(until_eof=True)
for read in iter:
read.set_tag("OQ", read.query_qualities, replace=False)
outfile.write(read)
infile.close()
outfile.close()
python GenerateBamWithOQTag.py -i subset.bam -o subset_OQ.bam
E00577:205:HVF37CCXY:3:2224:32461:53258 99 chr1 10419997 60 151M = 10420034 188 GGCAGTGGCTTCCGCGTGCCCCGTGTGCTGGTGCGGTTCCCATCACGCAGACAGGAAGGGTGTTTGCGCACTCTGATCAACTGGAACCTCTGTATCANGCGGCTGAATTCCCTTTTTCCTTNACTCNATAAAAGCTACATCAGACTGATGN AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJ<F<J-FJJJJJJJJJJJJJJFJJF<JJJFJ<JJJJJJ<JJFJJJJJJJJJJJJJJJJJJJFJJJJ#JJJJJAJJFJJJJJJJJJJJAFJ#FJJJ#JJJJJJJJFJJAJFAF<JJJAJF# MD:Z:97T23T4A23C0 PG:Z:MarkDuplicates RG:Z:HVF37CCXY.3 NM:i:4 AS:i:144 XS:i:107 OQ:B:C,32,32,37,37,37,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,27,37,27,41,12,37,41,41,41,41,41,41,41,41,41,41,41,41,41,41,37,41,41,37,27,41,41,41,37,41,27,41,41,41,41,41,41,27,41,41,37,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,37,41,41,41,41,2,41,41,41,41,41,32,41,41,37,41,41,41,41,41,41,41,41,41,41,41,32,37,41,2,37,41,41,41,2,41,41,41,41,41,41,41,41,37,41,41,32,41,37,32,37,27,41,41,41,32,41,37,2