linux - 如何找到两个文本文件之间的匹配模式并输出到另一个文件？

Question

我有两个文本组织不同的文本文件。这两个文件在文本中都包含几个相同的模式（数字）。我想找出两个文件中都存在哪些模式（数字）并将它们写入输出文件。

文件 1.txt：

blablabla_25947.bkwjcnwelkcnwelckme

blablabla_111.bkwjcnwelkcnwelckme

blablabla_65155.bkwjcnwelkcnwelckme

blablabla_56412.bkwjcnwelkcnwelckme

文件2.txt：

blablabla_647728.bkwjcnwelkcnwelck
kjwdhcwkejcwmekcjwhemckwejhcmwekch

blablabla_6387.bkwjcnwelkcnwelckme
wexkwhenqlciwuehnqweiugfnwekfiugew
wedhwnejchwenckhwqecmwequhcnkwjehc
owichjwmelcwqhemclekcelmkjcelkwejc

blablabla_59148.bkwjcnwelkcnwelckme
ecmwequhcnkwjehcowichjwmelcwqhemcle
kcelmkjcelkwejcwecawecwacewwAWWAXEG

blablabla_111.bkwjcnwelkcnwelckm
WESETRBRVSSCQEsfdveradassefwaefawecc

输出文件.txt：

score 1 · Accepted Answer

怎么样：

$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.'
111

# Redirect to new file
$ egrep -o '_[0-9]+\.' file1 | grep -of - file2 | tr -d '_.' > file3

第一个 grep 从 file1 中提取所有数字字符串（前面是 _、后面是 .），然后把这个列表作为模式在 file2 中 grep 出匹配项，最后用 tr 去掉 _ 和 .。

score 0 · Accepted Answer

事实上，我确实尝试解决了我所理解的你提出的“难题”。以下代码查找同时出现在 file1 和 file2 中的最长字符串；如果有多个“最长”字符串，它只报告找到的第一个。它也许在某些时候能对别人有帮助（尽管可能不是您在此处寻找的解决方案）：

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <sys/stat.h>

/*
 * Return the size in bytes of the file named FILE_NAME, as reported by
 * stat().
 *
 * This routine does not return on failure: it prints a diagnostic to
 * stderr and exits with EXIT_FAILURE.  That happens when stat() fails,
 * and also when the reported size does not fit in the 'unsigned' return
 * type -- a silently truncated size would make callers allocate and
 * read less than the file actually holds.
 */

static unsigned
get_file_size (const char * file_name)
{
    struct stat sb;
    unsigned size;

    if (stat (file_name, & sb) != 0) {
        fprintf (stderr, "'stat' failed for '%s': %s.\n",
                 file_name, strerror (errno));
        exit (EXIT_FAILURE);
    }

    /* Narrow to the return type, then verify that nothing was lost. */
    size = (unsigned) sb.st_size;
    if (sb.st_size < 0
        || (unsigned long long) sb.st_size != (unsigned long long) size) {
        fprintf (stderr, "'%s' is too large to be handled.\n", file_name);
        exit (EXIT_FAILURE);
    }
    return size;
}

/*
 * Read the entire file named FILE_NAME into a newly malloc'ed buffer and
 * return that buffer.
 *
 * The buffer holds the file's bytes followed by one terminating NUL
 * byte, so it can be handed to the str* functions (which will, of
 * course, stop at the first NUL byte inside the file, if any).  The
 * caller owns the buffer and releases it with free().
 *
 * This routine does not return on failure: if the size lookup, the
 * allocation, the open, the read or the close fails, it prints a
 * diagnostic to stderr and exits with EXIT_FAILURE.
 */

static unsigned char *
read_whole_file (const char * file_name)
{
    unsigned s;
    unsigned char * contents;
    FILE * f;
    size_t bytes_read;
    int status;

    s = get_file_size (file_name);
    /* Do the "+ 1" in size_t so it cannot wrap around in 'unsigned'. */
    contents = malloc ((size_t) s + 1);
    if (! contents) {
        fprintf (stderr, "Not enough memory.\n");
        exit (EXIT_FAILURE);
    }

    /* Binary mode: the byte count must match the size reported by stat,
       which text-mode newline translation would break on some
       platforms. */
    f = fopen (file_name, "rb");
    if (! f) {
        fprintf (stderr, "Could not open '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    bytes_read = fread (contents, sizeof (unsigned char), s, f);
    if (bytes_read != s) {
        fprintf (stderr, "Short read of '%s': expected %u bytes "
                 "but got %zu: %s.\n", file_name, s, bytes_read,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    /* Terminate the data; the extra byte was allocated for this. */
    contents[bytes_read] = '\0';

    status = fclose (f);
    if (status != 0) {
        fprintf (stderr, "Error closing '%s': %s.\n", file_name,
                 strerror (errno));
        exit (EXIT_FAILURE);
    }
    return contents;
}

/*
 * Print the longest run of bytes that occurs in both input files.
 *
 * Usage: PROGRAM FILE1 FILE2
 *
 * Every start position in FILE1 is compared against every start position
 * in FILE2 and the longest common run is remembered.  When several runs
 * share the maximum length, the first one found (lowest position in
 * FILE1, then lowest position in FILE2) is reported.  If the files have
 * no byte in common, a length of 0 and an empty line are printed.
 *
 * Returns 0 on success and EXIT_FAILURE when fewer than two file names
 * are given.
 */
int main(int argc, char* argv[]){
    unsigned char *f1, *f2; // buffers with entire file contents
    size_t l1, l2;          // number of bytes considered in each buffer
    size_t i1, i2;          // candidate start positions in f1 / f2
    size_t lm;              // length of the match at (i1, i2)
    size_t longestStart = 0; // where the best match begins in f1
    size_t longestFound = 0; // length of the best match so far

    if (argc < 3) {
        fprintf(stderr, "usage: %s FILE1 FILE2\n",
                argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    f1  = read_whole_file (argv[1]);
    f2  = read_whole_file (argv[2]);

    // NOTE: strlen relies on the buffers being NUL-terminated, and it
    // only counts up to the first NUL byte in each of them.
    l1 = strlen((const char *) f1);
    l2 = strlen((const char *) f2);

    // A match starting at i1 can be at most l1 - i1 bytes long, so start
    // positions that cannot beat the current best are skipped (the same
    // reasoning applies to i2).  This does not change which match wins.
    for (i1 = 0; i1 + longestFound < l1; i1++) {
        // Scan all of f2 from its beginning: the common run may sit
        // earlier in f2 than it does in f1.
        for (i2 = 0; i2 + longestFound < l2; i2++) {
            lm = 0;

            // Bounds are checked before the bytes are compared.
            while (i1 + lm < l1 && i2 + lm < l2
                   && f1[i1 + lm] == f2[i2 + lm]) {
                lm++;
            }

            // Strictly greater: keeps the first of several equally long
            // matches.
            if (lm > longestFound) {
                longestStart = i1;
                longestFound = lm;
            }
        }
    }

    printf("longest string found is %zu characters:\n", longestFound);
    // Written straight from f1, so no fixed-size copy buffer is needed
    // and matches of any length are printed in full.
    fwrite(f1 + longestStart, 1, longestFound, stdout);
    putchar('\n');
    free(f1);
    free(f2);
    return 0;
}

读取整个文件内容的代码位于http://www.lemoda.net/c/read-whole-file/index.html

linux - 如何找到两个文本文件之间的匹配模式并输出到另一个文件？

2 回答 2

Related

Reference