regex - 按文本文件中的行长对哈希值进行排序

Question

读取 file1 中的每一行，如果 file2 中不存在对应的行，则将该行写入新文件。
比较时只比较每行的前 38 个字符（即 32 位哈希值、冒号及其后的 5 个字符）。

file1中的哈希值：

cf03189f5b05eb1a9658f80d7a0e9f02:_#.g}
edbe6de8b3ee19b45e092147f57af7b8:]mNon
47253940f843f258ffd265d13f365d70:/u'yv
5701aa8e2aa7e1cfd16ca4076bd1732a:@AQ1z
b3c0866e6fd56776bc4a18d3c87cc725:t$5OV
7a1e6090568e076c55df9dc7abf356c6:9rC@p
04046da33706518d9b15a38bcddb448e:!DFPk

file2中的哈希值：

edbe6de8b3ee19b45e092147f57af7b8:]mNon:str1
47253940f843f258ffd265d13f365d70:/u'yv:2str
b3c0866e6fd56776bc4a18d3c87cc725:t$5OV:3str1ng

这是一个用 C 编写的工作示例：

#include <stdio.h>
#include <string.h>

#define LINE_LENGTH 80
/* Number of leading characters that identify a line: in the sample data this
   is 32 hex digits, one ':' and 5 more characters. */
#define KEY_LENGTH  38
typedef enum {FALSE=0,TRUE=1} BOOL;

/* Returns TRUE when buf (as filled by fgets) holds a complete line, i.e. it
   ends with '\n'; FALSE means the line was longer than the buffer or the
   file ended without a final newline. */
static BOOL ends_with_newline(const char *buf)
{
    size_t len = strlen(buf);

    return (len > 0 && buf[len - 1] == '\n') ? TRUE : FALSE;
}

/* Consumes the rest of the current line of `in` (up to and including '\n',
   or up to EOF). When `out` is not NULL the consumed characters are copied
   to it, otherwise they are discarded. */
static void drain_line(FILE *in, FILE *out)
{
    int ch;

    while ((ch = fgetc(in)) != EOF)
    {
        if (out != NULL)
        {
            fputc(ch, out);
        }
        if (ch == '\n')
        {
            break;
        }
    }
}

/* Rewinds fp and reports whether any of its lines begins with the same
   KEY_LENGTH characters as `key`. Only the start of each line is examined;
   the tail of a line longer than the buffer is skipped so that it is never
   mistaken for a line of its own. */
static BOOL key_in_file(FILE *fp, const char *key)
{
    char candidate[LINE_LENGTH] = {0};

    rewind(fp);
    while (fgets(candidate, sizeof(candidate), fp) != NULL)
    {
        BOOL complete = ends_with_newline(candidate);

        if (strncmp(key, candidate, KEY_LENGTH) == 0)
        {
            return TRUE;
        }
        if (!complete)
        {
            drain_line(fp, NULL);
        }
    }
    return FALSE;
}

/*
  Usage: <prog> <file1> <file2> <OutFile>

  Copies to OutFile every line of file1 whose first KEY_LENGTH characters do
  not start any line of file2. file2 is rescanned for every line of file1.

  Returns 0 on success and 1 on a usage error or any I/O failure.
*/
int main(int argc, char *argv[])
{
    FILE   *fpin1 = NULL;
    FILE   *fpin2 = NULL;
    FILE   *fpout = NULL;
    char   line[LINE_LENGTH] = {0};
    int    status = 1;

    if (argc != 4)
    {
        fprintf(stderr, "Usage:%s <file1> <file2> <OutFile>\n",
                (argc > 0) ? argv[0] : "prog");
        return 1;
    }

    /* Open the files one at a time so that a failure can name the file and
       the streams that were already opened are still closed. */
    if ((fpin1 = fopen(argv[1], "r")) == NULL)
    {
        perror(argv[1]);
        goto cleanup;
    }
    if ((fpin2 = fopen(argv[2], "r")) == NULL)
    {
        perror(argv[2]);
        goto cleanup;
    }
    if ((fpout = fopen(argv[3], "w")) == NULL)
    {
        perror(argv[3]);
        goto cleanup;
    }

    while (fgets(line, sizeof(line), fpin1) != NULL) /* Next line of file1 */
    {
        BOOL complete = ends_with_newline(line);
        BOOL excluded = key_in_file(fpin2, line);

        /* Lines that are NOT present in file2 are the ones written out. */
        if (!excluded && fputs(line, fpout) == EOF)
        {
            perror(argv[3]);
            goto cleanup;
        }
        /* A line longer than the buffer shares the fate of its first part:
           the remainder is either copied or dropped as a whole. */
        if (!complete)
        {
            drain_line(fpin1, excluded ? NULL : fpout);
        }
    }

    if (ferror(fpin1) || ferror(fpin2))
    {
        fprintf(stderr, "Error! Could not read input files\n");
        goto cleanup;
    }
    if (ferror(fpout))
    {
        perror(argv[3]);
        goto cleanup;
    }
    status = 0;

cleanup:
    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fpout != NULL && fclose(fpout) == EOF)
    {
        perror(argv[3]);
        status = 1;
    }
    if (fpin2 != NULL)
    {
        fclose(fpin2);
    }
    if (fpin1 != NULL)
    {
        fclose(fpin1);
    }
    if (status == 0)
    {
        printf("\nDone...\n");
    }
    return status;
}

该程序的OutFile是：

cf03189f5b05eb1a9658f80d7a0e9f02:_#.g}
5701aa8e2aa7e1cfd16ca4076bd1732a:@AQ1z
7a1e6090568e076c55df9dc7abf356c6:9rC@p
04046da33706518d9b15a38bcddb448e:!DFPk

我想知道如何用 awk 实现同样的功能？

score 3 · Accepted Answer

# While reading the first file (excludes.txt), record each (field 1, field 2) pair as an array key.
# While reading the second file (all.txt), print only the lines whose pair was not recorded.
awk -F: 'NR==FNR{a[$1,$2];next}!(($1,$2) in a)' excludes.txt all.txt

注意文件参数顺序的反转。

解释：
-F: - 使用 : 作为字段分隔符
NR==FNR - 仅在读取第一个文件时成立（当前文件内的行号 FNR 等于总行号 NR）
a[$1,$2] - 以前两个字段为键，在数组 a 中创建一个元素（只引用，不赋值）
next - 跳到下一行，因此不必再单独判断 NR != FNR 的情况
!(($1,$2) in a) - 检查该组合是否已经出现过；如果没有，则打印该行（默认动作）

score 2 · Accepted Answer

GNU sed的代码：

# Stage 1: rewrite every file2 line "<prefix>:<suffix>" (split at the last ':') into the
#          sed command "\§<prefix>§d", i.e. "delete lines matching <prefix>" with § as the
#          regex delimiter (µ is the delimiter of the s command itself).
# Stage 2: read that generated script from stdin (-f -) and run it against file1.
# NOTE(review): <prefix> is interpreted as a regular expression, so characters such as
#               '.', '$' or '[' inside it are not matched literally.
sed -r 'sµ(.*):.*$µ\\§\1§dµ' file2 |sed -f - file1

由于许多“丑陋”字符，代码仅供参考，请勿在生产中使用。

score 1 · Accepted Answer

我想你的意思是awk。可以这样做，但它会占用 2 倍于文件大小的内存：

# Print the lines of file1 whose first 38 characters (the hash:text key) do not
# start any line of file2. The lines of file1 and the keys of file2 are all
# kept in memory until END, so memory use grows with the size of both files.
awk '
FILENAME == ARGV[1] {                 # first operand (file1): remember line and key in input order
       line[++n] = $0;
       key[n] = substr($0, 1, 38);
       next;
}
{ in_file2[substr($0, 1, 38)]; }      # second operand (file2): mark the key as present
END {
       for (i = 1; i <= n; i++) {
          if (!(key[i] in in_file2))
             print line[i];
       }
}' file1 file2

输出：

cf03189f5b05eb1a9658f80d7a0e9f02:_#.g}
5701aa8e2aa7e1cfd16ca4076bd1732a:@AQ1z
7a1e6090568e076c55df9dc7abf356c6:9rC@p
04046da33706518d9b15a38bcddb448e:!DFPk

score 1 · Accepted Answer

在 perl 中

# -F: with -a splits each input line on ':' into @F; $f remembers the first file name (file2).
# Every line marks its hash $F[0] as seen. A line is printed only when it does not come
# from the first file and its hash has not been seen before, so only the hash field
# (not the text after it) takes part in the comparison.
perl -F: -ane'BEGIN{$f=$ARGV[0]}print if$ARGV ne$f&&!$h{$F[0]};$h{$F[0]}=1' file2 file1

regex - 按文本文件中的行长对哈希值进行排序

4 回答 4

Related

Reference