也可以使用 awk 来完成这个任务。假设你有以下两个文件:
$ cat a.csv
#product_id,product_name,brand_name,price
1,pname1,bname1,100
10,pname10,bname10,200
20,pname20,bname20,300
$ cat b.csv
#product_id,product_category,product_name,brand_name,price
3,pcat3,pname3,bname3,42
10,pcat10,pname10,bname10,199
20,pcat20,pname20,bname20,299
30,pcat10,pname30,bname30,420
使用“FNR==NR”方法(参见例如《The Unix shell: comparing two files with awk》):
# While reading a.csv (FNR==NR), store each non-comment line in array `a` keyed by field 1; then, for every line whose field 1 is a key of `a`, print its fields 1-4 followed by field 4 of the stored a.csv line.
$ awk -F, 'FNR==NR{if(!/^#/){a[$1]=$0;next}}($1 in a){split(a[$1],tmp,",");printf "%d,%s,%s,%s,%d\n",$1,$2,$3,$4,tmp[4];}' a.csv b.csv
10,pcat10,pname10,bname10,200
20,pcat20,pname20,bname20,300
将每个文件读入一个数组(参见例如《Awking it – how to load a file into an array in awk》,出自 Tapping away):
# Same join done entirely in BEGIN: a.csv is loaded into array `a` keyed by field 1, then b.csv is scanned. The loops test `(getline < file) > 0` because getline returns -1 on a read error (e.g. missing file), which a bare `while(getline < file)` would treat as true and loop forever.
$ awk -F, 'BEGIN{while((getline < "a.csv") > 0){if(!/^#/){a[$1]=$0;}}close("a.csv");while((getline < "b.csv") > 0){if($1 in a){split(a[$1],tmp,",");printf "%d,%s,%s,%s,%d\n",$1,$2,$3,$4,tmp[4];}}close("b.csv");}'
10,pcat10,pname10,bname10,200
20,pcat20,pname20,bname20,300
本质上,这两种方法做的是同一件事:
- 读取第一个文件(`a.csv`),并将其各行存储在关联数组 `a` 中,以该行的第一个字段 `$1`(在本例中为 `product_id`)作为键/索引;
- 然后读取第二个文件(`b.csv`);如果其中某一行的第一个字段存在于数组 `a` 中,则输出 `b.csv` 当前行的前四个字段,以及数组 `a` 中相应条目的第四个字段(`price`)。
不同之处在于:使用 `FNR==NR` 方法时,输入文件在命令行上作为 `awk` 的参数指定,而且基本上只能把第一个文件识别为“特殊”文件,从而将其存入数组;使用第二种方法时,每个输入文件都可以解析到各自单独的数组中——不过,输入文件是在 `awk` 脚本内部指定的,而不是通过 `awk` 的参数指定;既然此时根本不需要给 `awk` 传文件参数,整个 `awk` 脚本就必须放在一个 `BEGIN{...}` 块内。
从文件中读取行时,它们会根据命令行选项 `-F,`(将逗号设置为分隔符)自动拆分为字段;但是,取出存储在数组中的行时,我们必须用 `split()` 另行拆分它们。
第一种方法的逐步解析:
# Annotated form of the FNR==NR one-liner (run as: awk -F, -f this_file a.csv b.csv).
# NOTE: in awk the opening `{` of an action must be on the same line as its pattern;
# a pattern alone on a line gets the default action (print), and a `{...}` that
# starts on the following line becomes a separate rule that runs for every record.
FNR==NR { # if FNR (input record number in the current input file) equals NR (total num records so far)
          # true while the first file is being read (provided that file is not empty)
  if(!/^#/) { # if the current line does not `!` match regex `/.../` of start `^` with `#`
    a[$1]=$0; # assign current line `$0` to array `a`, with index/key being first field in current line `$1`
    next      # skip the remaining rules, and start processing next line
  }
}
# --this section below executes when FNR does not equal NR;--
# (more precisely: it is evaluated for every record that did not hit `next` above)
($1 in a) { # first, check if first field `$1` of current line is in array `a`
  split(a[$1],tmp,","); # split entry `a[$1]` at commas into array `tmp`
  printf "%d,%s,%s,%s,%d\n",$1,$2,$3,$4,tmp[4]; # print reconstructed current line,
                                                 # taking the fourth field from the `tmp` array
}
第二种方法的逐步解析:
# Annotated form of the getline-based join; reads a.csv and b.csv from the current directory.
BEGIN{ # since no file arguments here, everything goes in BEGIN block
  # `getline < file` returns 1 when a record was read, 0 at end of file and -1 on error,
  # so the loops test `> 0`: a bare `while(getline < file)` would never end
  # if the file cannot be read, because -1 counts as true
  while((getline < "a.csv") > 0){ # while reading lines from first file
    if(!/^#/){ # if the current line does not `!` match regex `/.../` of start `^` with `#`
      a[$1]=$0; # store current line `$0` to array `a`, with index/key being first field in current line `$1`
    }
  }
  close("a.csv");
  while((getline < "b.csv") > 0){ # while reading lines from second file
    if($1 in a){ # first, check if first field `$1` of current line is in array `a`
      split(a[$1],tmp,","); # split the stored a.csv line `a[$1]` at commas into array `tmp`
      printf "%d,%s,%s,%s,%d\n",$1,$2,$3,$4,tmp[4]; # print first four fields of the current b.csv line,
                                                     # then the fourth field of the stored a.csv line
    }
  }
  close("b.csv");
} # end BEGIN
关于 `FNR==NR` 执行方式的说明:
# The second rule has an always-true pattern `(1)` or no pattern at all, so it prints every line of both files; the first rule additionally prints "-" before each a.csv line.
$ awk -F, 'FNR==NR{print "-";} (1){print;}' a.csv b.csv # or:
$ awk -F, 'FNR==NR{print "-";} {print;}' a.csv b.csv
-
#product_id,product_name,brand_name,price
-
1,pname1,bname1,100
-
10,pname10,bname10,200
-
20,pname20,bname20,300
#product_id,product_category,product_name,brand_name,price
3,pcat3,pname3,bname3,42
10,pcat10,pname10,bname10,199
20,pcat20,pname20,bname20,299
30,pcat10,pname30,bname30,420
# With an explicit FNR!=NR pattern the second rule prints only lines for which FNR differs from NR (here: the lines of b.csv); a.csv lines produce just "-".
$ awk -F, 'FNR==NR{print "-";} FNR!=NR{print;}' a.csv b.csv
-
-
-
-
#product_id,product_category,product_name,brand_name,price
3,pcat3,pname3,bname3,42
10,pcat10,pname10,bname10,199
20,pcat20,pname20,bname20,299
30,pcat10,pname30,bname30,420
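这意味着,上面的注释“this section below executes when FNR does not equal NR;”(当 FNR 不等于 NR 时执行下面的这部分)原则上是错误的:没有模式或模式为真的规则会对每一条未被 `next` 跳过的记录执行,而不只是在 FNR 不等于 NR 时执行——尽管该特定示例最终的行为确实如此。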