一种使用 awk 的方式。这不是一个简单的脚本。简要解释一下处理过程:关键在于变量 'all_ranges'。当它被复位(为 0)时,脚本从范围文件中读取并保存其中的数据;当它被置位(为 1)时,则停止读取范围文件,转而处理 'id-position' 文件中的行,在已保存的范围数组中检查该位置,如果落在某个范围内就打印出来。我试图避免多次处理范围文件,而是按块进行处理,这使得脚本更加复杂。
编辑补充:我假设两个文件中的 id 字段都已排序。否则这个脚本会彻底失效,你需要换一种方法。
script.awk 的内容:
BEGIN {
    ## Expected invocation:
    ##     awk -f script.awk <id_position_file> <ranges_file>
    ##   ARGV[0] = awk
    ##   ARGV[1] = <id_position_file>  (stays in ARGV, processed by the main rules)
    ##   ARGV[2] = <ranges_file>       (read manually with getline)
    ##   ARGC    = 3
    ## Without this check a single-file invocation would use that file as the
    ## ranges file and leave awk waiting on standard input.
    if ( ARGC < 3 ) {
        printf "%s\n", "ERROR: two input files are required" > "/dev/stderr"
        exit 1
    }

    ## Take the last argument as the ranges file and shrink ARGC so that awk
    ## does not also process it as regular input.
    f2 = ARGV[ --ARGC ]

    ## 0 = lines still have to be read from the ranges file;
    ## 1 = reading is paused (set by the data rule).
    all_ranges = 0

    ## Consume the first line of the ranges file; its second field is the
    ## name of the 'class' column appended to the output header.
    ret = getline line < f2
    if ( ret <= 0 ) {
        msg = "ERROR: cannot read header line of " f2
        ## ERRNO is a gawk extension and is only meaningful on a read error.
        if ( ret < 0 ) {
            msg = msg ": " ERRNO
        }
        printf "%s\n", msg > "/dev/stderr"
        exit 1
    }
    split( line, fields )
    class_header = fields[2]
}
## First line of the id-position file: echo it with the name of the
## 'class' column (taken from the ranges file header) appended after a tab.
FNR == 1 {
    header_out = sprintf( "%s\t%s", $0, class_header )
    printf "%s\n", header_out
    next
}
## Data lines of the id-position file.
##
## For every record ($1 = id, $2 = position) the ranges of that id are pulled
## from the ranges file 'f2' (4 fields per line: id, class, start, end) and the
## record is printed with the class of the first range containing the position.
## Records that match no range produce no output.
##
## State kept between records:
##   ranges[start, end] = class   ranges collected for the id 'ranges_id'
##   fields / range_id            last line read from 'f2' and its id
##   all_ranges                   1 while 'fields' holds a look-ahead line that
##                                belongs to a different id than the current one
##   ranges_eof                   1 once 'f2' has been read completely
##
## Requirements: both files are grouped by id in the same order, and every id
## of the ranges file also appears in the id-position file (a look-ahead line
## is only consumed when a record with its id shows up).
FNR > 1 {
    ## Ranges collected for another id must never be matched against this one.
    if ( $1 != ranges_id ) {
        delete ranges
        ranges_id = $1
    }

    ## The look-ahead line read while handling the previous id belongs to this
    ## record: store it and resume reading the ranges file.
    if ( all_ranges && range_id == $1 ) {
        ranges[ fields[3], fields[4] ] = fields[2]
        all_ranges = 0
    }

    ## Read ranges until a line of another id shows up or the file ends.
    while ( ! all_ranges && ! ranges_eof ) {
        ret = getline line < f2

        ## Read error. Messages go to stderr so they do not get mixed into
        ## the data written to stdout.
        if ( ret == -1 ) {
            printf "%s\n", "ERROR: " ERRNO > "/dev/stderr"
            exit 1
        }

        ## End of file: remember it and still match the current record below.
        if ( ret == 0 ) {
            ranges_eof = 1
            break
        }

        ## Split line in spaces.
        num = split( line, fields )
        if ( num != 4 ) {
            printf "%s\n", "ERROR: Bad format of file " f2 > "/dev/stderr"
            exit 2
        }

        range_id = fields[1]
        if ( range_id == $1 ) {
            ranges[ fields[3], fields[4] ] = fields[2]
        }
        else {
            ## Line of a later id: keep it in 'fields' as look-ahead.
            all_ranges = 1
        }
    }

    ## Print the record with the class of the first range that contains it.
    for ( range in ranges ) {
        split( range, pos, SUBSEP )
        if ( $2 >= pos[1] && $2 <= pos[2] ) {
            printf "%s\t%s\n", $0, ranges[ range ]
            break
        }
    }
}

## Every record is matched in the data rule, so nothing is printed here
## (printing again would duplicate the last record). This rule also runs after
## an 'exit' above; the exit status given there is kept.
END {
    close( f2 )
}
像这样运行它:
awk -f script.awk file1 file2 | column -t
结果如下:
id position class
a1 21 Xfact
a1 39 Xfact
a1 77 xbreak
b1 88 Xbreak
b1 122 Xbreak
c1 22 Xbreak