如果我运行:
# Rewrite file2.txt so its fields line up with file1.txt, remember fields 4-10 of every file2 line under its field 2, then append that text to each file1.txt line with the same field 2. (No IFS changes: IFS='\n' means "backslash and n", not newline, and eats every "n" in the data.)
file1=file1.txt; sed -e 's/>gi/Query=gi/g' -e 's/_ref_/ ref_/g' file2.txt | awk 'NR==FNR { if ($2 != "") tag[$2] = $4" "$5" "$6" "$7" "$8" "$9" "$10; next } ($2 in tag) { print $0" "tag[$2] }' - "$file1"
下面把它写成脚本来说明其作用和用法。脚本中的默认文件名被设置为 file1.rasta 和 file2.rasta(这两个文件并不存在),因此不带参数运行时,它会提示需要我提供输入文件:
./run.sh
-------------------------------------------------------------------------------
No variables defined settings files as:
fil1=file1.rasta
file2=file2.rasta
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
One of the following:
file1=file1.rasta
file2=file2.rasta
does not exist!
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
usage:
./run.sh file1.fasta file2.fasta is the same as line below
./run.sh ./file1.fasta ./file2.fasta
-- This is if files are elsewhere
./run.sh /path/to/file1.fasta /path/to/file2.fasta
-------------------------------------------------------------------------------
运行它:
./run.sh ./file1.fasta ./file2.fasta
1 Query=gi_148659820 ref_YP_001281343.1_ Hit=gi_148659820 ref_YP_001281343.1_ hypothetical protei TBFG_10059 [Mycobacterium tuberculosis F11]
2 Query=gi_148659820 ref_YP_001281343.1_ Hit=gi_148821250 ref_YP_001286004.1_ hypothetical protei TBFG_10059 [Mycobacterium tuberculosis F11]
3 Query=gi_148659820 ref_YP_001281343.1_ Hit=gi_15607202 ref_NP_214574.1_ hypothetical protei TBFG_10059 [Mycobacterium tuberculosis F11]
4 Query=gi_148659820 ref_YP_001281343.1_ Hit=gi_253796975 ref_YP_003029976.1_ hypothetical protei TBFG_10059 [Mycobacterium tuberculosis F11]
5 Query=gi_148659820 ref_YP_001281343.1_ Hit=gi_375294260 ref_YP_005098527.1_ hypothetical protei TBFG_10059 [Mycobacterium tuberculosis F11]
bash 脚本 run.sh 就是上面那条单行命令,只是为了便于解释而拆分成了多行:
#!/bin/bash
# Print one horizontal separator (a fixed-width row of dashes) followed by a
# newline on stdout. Takes no arguments.
line() {
  printf '%s\n' "-------------------------------------------------------------------------------"
}
# Print the usage summary, framed by two separator lines, on stdout.
# Arguments: none. Reads $0 (the script name as invoked) for the examples.
# Relies on the line() helper for the separators.
usage() {
  line
  # A single printf with quoted arguments: "$0" is not word-split or globbed
  # (script paths may contain spaces), and the "--" text is printed literally
  # without depending on how echo treats option-like words.
  printf '%s\n' \
    "usage:" \
    "$0 file1.fasta file2.fasta is the same as line below" \
    "$0 ./file1.fasta ./file2.fasta" \
    "-- This is if files are elsewhere" \
    "$0 /path/to/file1.fasta /path/to/file2.fasta"
  line
}
# -----------------------------------------------------------------------------
# Main: append the description ("tag") found in file2 to every line of file1
# whose second whitespace-separated field also occurs as the second field of a
# rewritten file2 line.
#
# Arguments:
#   $1 - file1: the lines to be annotated (matched on field 2)
#   $2 - file2: the lines carrying the tag text
# With fewer than two arguments both names fall back to the defaults below.
# Output:      annotated file1 lines on stdout; notices and errors on stderr.
# Exit status: 2 when either input file does not exist.
# -----------------------------------------------------------------------------
file1=$1
file2=$2
if [ "$#" -lt 2 ]; then
  # Default file names. They must exist in the current directory; otherwise
  # set them to a full path such as /path/to/file1.fasta.
  file1=file1.rasta
  file2=file2.rasta
  # Notices go to stderr so they never mix with the data written to stdout.
  {
    line
    printf 'No variables defined settings files as:\nfil1=%s\nfile2=%s\n' "$file1" "$file2"
    line
  } >&2
fi

# Both inputs must be regular files, whether they came from the arguments or
# from the defaults. The expansions are quoted on purpose: unquoted, an empty
# argument collapses the test to `[ ! -f ]`, which is false, and the check
# would silently pass.
if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
  {
    line
    printf 'One of the following: \n file1=%s\nfile2=%s\n does not exist!\n' "$file1" "$file2"
    line
    usage
  } >&2
  exit 2
fi

# Rewrite file2 so it follows the same pattern as file1:
#   '>gi'   -> 'Query=gi'
#   '_ref_' -> ' ref_'
# and stream the result straight into awk as its first input ("-").
#
# The previous version stored the sed output in a variable, set IFS='\n' and
# ran an unquoted `echo $cfile2`. IFS='\n' is the two characters backslash and
# "n" (not a newline), so every "n" in the data was treated as a separator and
# dropped, e.g. "protein" came out as "protei". Piping avoids IFS and word
# splitting altogether.
#
# To inspect the rewritten file2 while debugging, run the sed command alone.
#
# awk, first input (rewritten file2):
#   remember field 2 as a known key, and store as its tag everything that
#   follows the first ".<digits>_ " on the line (the point where the
#   description starts). The tag is stored per key, so each file1 line gets
#   the tag of its own file2 record rather than the tag of the last one read.
# awk, second input (file1):
#   print the line plus a single space and the tag when field 2 is a known
#   key; the tag is empty if the file2 line had no ".<digits>_ " marker.
sed -e 's/>gi/Query=gi/g' -e 's/_ref_/ ref_/g' < "$file2" |
  awk '
    NR == FNR {
      if ($2 != "") {
        seen[$2] = 1
        # RSTART + RLENGTH is the first character after the marker, so the
        # tag carries no leading blank.
        if (match($0, /\.[0-9]+_ /)) {
          tag[$2] = substr($0, RSTART + RLENGTH)
        }
      }
      next
    }
    ($2 in seen) {
      print $0 " " tag[$2]
    }
  ' - "$file1"
## Method used originally - updated to above which is much cleaner
## pattern matches and then from that point it captures entire string which would
## ensure it captures the entire tag from file2
##echo $cfile2| awk 'NR==FNR {
## _[$2]=$2;
## f1_line[key] = $4" "$5" "$6" "$7" "$8" "$9" "$10
## }
## NR!=FNR {
## if(_[$2] != "") print $0" "f1_line[key]
## }' - $file1