我在目录中有一堆 fastq 文件,我想将序列修剪 2 个核苷酸和质量(如果读取有 51 个碱基对并且以 CTG 或 TTG 结尾)。
这是我写的 shell 脚本,但我遇到了一些错误,需要帮助,因为我是 shell 脚本的新手
输入:
@HWI-ST1072:187:C35YUACXX:7:1101:1609:1983 1:N:0:ACAGTG
NGGAGAAAGAGAGTGTGTTTTTAGGGGGAGATTTTTAAAATGGTTGTTTTG
+
#0<BFFFFFFFFF<BFFFIIFFFFFIIIBFFFFFIIFIIIIIFFBFFFFFF
@HWI-ST1072:187:C35YUACXX:7:1101:1747:1995 1:N:0:ACAGTG
NGGTTGTGGTGGTGGGTATTTGTAGTTTTATTTATTCGGGAGGTTGAGCTG
+
#0<BFFFFFFFFFFIIBFFIIIIIIFIIIFFIIFIIIFIIFIIFFFFIIFF
@HWI-ST1072:187:C35YUACXX:7:1101:9351:2210 1:N:0:ACAGTG
CGGTTTTGTTTTATTTTGTATGATTAGGAGGGTTTTGGAGGTTTAGTTACC
+
BBBFFFFFFFFFFIIIIIFFIIFIIIIIIIIIFFIIFIFIIFFIIIFIIII
@HWI-ST1072:187:C35YUACXX:7:1101:1747:1995 1:N:0:ACAGTG
NGGTTGTGGTGGTGGGTATTTGTAGTTTTATTTAT
+
#0<BFFFFFFFFFFIIBFFIIIIIIFIIIFFIIFI
输出:
@HWI-ST1072:187:C35YUACXX:7:1101:1609:1983 1:N:0:ACAGTG
NGGAGAAAGAGAGTGTGTTTTTAGGGGGAGATTTTTAAAATGGTTGTTT
+
#0<BFFFFFFFFF<BFFFIIFFFFFIIIBFFFFFIIFIIIIIFFBFFFF
@HWI-ST1072:187:C35YUACXX:7:1101:1747:1995 1:N:0:ACAGTG
NGGTTGTGGTGGTGGGTATTTGTAGTTTTATTTATTCGGGAGGTTGAGC
+
#0<BFFFFFFFFFFIIBFFIIIIIIFIIIFFIIFIIIFIIFIIFFFFII
@HWI-ST1072:187:C35YUACXX:7:1101:9351:2210 1:N:0:ACAGTG
CGGTTTTGTTTTATTTTGTATGATTAGGAGGGTTTTGGAGGTTTAGTTACC
+
BBBFFFFFFFFFFIIIIIFFIIFIIIIIIIIIFFIIFIFIIFFIIIFIIII
@HWI-ST1072:187:C35YUACXX:7:1101:1747:1995 1:N:0:ACAGTG
NGGTTGTGGTGGTGGGTATTTGTAGTTTTATTTAT
+
#0<BFFFFFFFFFFIIBFFIIIIIIFIIIFFIIFI
脚本:
for sample in *.fastq;do
name=$(echo ${sample} | sed 's/.fastq//')
while read line;do
if [ ${line:0:1} == "@" ] ; then
head="${line}"
$echo $head
elif [ "${head}" ] && [ "${line}" ] ; then
length=${#line}
if [ "${length}" = 51 -a "${line}" =~ *CTG|*TTG ] ; then
sequence= substr($line,0,49)
#echo $sequence
fi
elif [ ${line:0:1} == "+" ] ; then
plus="${line}"
#echo $plus
elif [ "${plus}" ] && [ "${line}" ] ; then
quality= substr($line,0,49)
#echo $quality
fi
print "${head}\n${sequence}\n${plus}\n${quality}" > ${name}_new.fq
done < $sample
done