0

我在尝试查找电子邮件地址时遇到了问题。我不知道可能是什么问题:(

/*
 * Report whether buffer contains at least one e-mail-like token.
 *
 * buffer  - NUL-terminated text to search (regexec() stops at the first NUL).
 * length  - not used by this function; kept so existing callers still compile.
 * detmode - not used by this function; kept so existing callers still compile.
 *
 * Returns 1 when a match is found and 0 otherwise. On a match, the start/end
 * offsets of the whole match (index 0) and of the parenthesised group
 * (index 1) are printed to stdout. Terminates the process with exit(1) if
 * the pattern cannot be compiled.
 */
static int contains_mail(const unsigned char *buffer, int length, int detmode)
{
    const char *reg_exp = "([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z0-9._%+-]+)";
    regex_t regex;
    regmatch_t matches[2];
    int found;

    (void)length;
    (void)detmode;

    if (regcomp(&regex, reg_exp, REG_EXTENDED) != 0) {
        fprintf(stderr, "Could not compile regex\n");
        exit(1);
    }

    /* regexec() takes a plain char pointer; the bytes are searched unchanged. */
    found = (regexec(&regex, (const char *)buffer, 2, matches, 0) == 0);

    /* The match array is only meaningful after a successful regexec(). */
    if (found) {
        printf("start0: %d", (int)matches[0].rm_so);
        printf("end0: %d", (int)matches[0].rm_eo);
        printf("start1: %d", (int)matches[1].rm_so);
        printf("end1: %d", (int)matches[1].rm_eo);
    }

    /* Release the memory regcomp() allocated for the compiled pattern. */
    regfree(&regex);

    return found;
}

示例输入文件:

dfo gpdf eriowepower riwope d@b.pl rwepoir weporsdfi dsfdfasdas@sdfaasdas.pl OSIDQOPWIEPOQWIE sdfs@asdsa.pl
WERO IWUEOIRU OWIERU WOIER asdas@asdasd.pl
aposidasop aposdi aspod iaspodi aspoid aspodi sdfsddfsd@asdasd.pl
werowerowe

看起来它开始于:

start0: 28end0: 28start1: 1end1: 8

但是看起来它不知道电子邮件的结尾是什么所以我无法计算它:(

4

2 回答 2

1

先问一个简单的问题:你是如何把输入文件的内容传给函数的?如果我像下面这样定义字符串并调用该函数:

/* Sample input: the question's example text embedded as a single
 * NUL-terminated string, with each input line ended by "\n". */
char string[] = "dfo gpdf eriowepower riwope d@b.pl rwepoir weporsdfi dsfdfasdas@sdfaasdas.pl OSIDQOPWIEPOQWIE sdfs@asdsa.pl\n\
WERO IWUEOIRU OWIERU WOIER asdas@asdasd.pl\n\
aposidasop aposdi aspod iaspodi aspoid aspodi sdfsddfsd@asdasd.pl\n\
werowerowe\n";

/* The length and detmode arguments are passed as 0; the question's
 * contains_mail() never reads either of them. */
contains_mail(string, 0, 0);

并修改您的contains_mail函数以重复调用regexec如下:

/* First search starts at the beginning of the buffer. */
reti = regexec(&regex, buffer, 2, matches, 0);
while (reti == 0) {
        /* matches[0] is the whole match and matches[1] the parenthesised
         * group; the offsets are relative to the current buffer pointer,
         * not to the start of the original string. */
        start0 = matches[0].rm_so;
        end0 = matches[0].rm_eo;
        start1 = matches[1].rm_so;
        end1 = matches[1].rm_eo;

        printf("start0: %d ", start0);
        printf("end0: %d\n", end0);
        printf("start1: %d ", start1);
        printf("end1: %d\n", end1);
        /* "%.*s" prints exactly (end1 - start1) characters, so the match
         * is shown without copying or NUL-terminating it. */
        printf("email: %.*s\n", end1 - start1, buffer + start1);
        /* Resume the search immediately after this match. */
        buffer += end1;
        /* REG_NOTBOL: the advanced pointer is not a real beginning of line. */
        reti = regexec(&regex, buffer, 2, matches, REG_NOTBOL);
} 

我得到了所有的匹配项:

$ ./email_regex
start0: 28 end0: 34
start1: 28 end1: 34
email: d@b.pl
start0: 19 end0: 42
start1: 19 end1: 42
email: dsfdfasdas@sdfaasdas.pl
start0: 18 end0: 31
start1: 18 end1: 31
email: sdfs@asdsa.pl
start0: 28 end0: 43
start1: 28 end1: 43
email: asdas@asdasd.pl
start0: 47 end0: 66
start1: 47 end1: 66
email: sdfsddfsd@asdasd.pl

我同意其他人的评论,您的正则表达式可能不是获取电子邮件地址的最佳选择。但你实际上想做什么?

于 2014-05-07T14:23:07.920 回答
1

该函数regexec始终最多只能找到一次您的正则表达式。第一个匹配项(索引为 0)包含整个匹配项的开始结束位置,以下匹配项包含括号中子表达式的数据。(您的示例中整个表达式周围的括号没有任何用途,但它们导致匹配 0 和 1 的相同位置,可能会误导您认为相同的表达式被一遍又一遍地解析。)

您可以在 while 循环中查找您的表达式,在成功匹配后向前移动指针以查找更多电子邮件地址。对您的代码做如下修改后,它会打印找到的所有电子邮件地址并返回匹配的数量。

/*
 * Print every e-mail-like token found in buffer, one per line, and return
 * how many were found.
 *
 * buffer - NUL-terminated text to search (regexec() stops at the first NUL).
 * length - not used by this function; kept so existing callers still compile.
 *
 * Returns the number of matches (0 when there are none). Terminates the
 * process with exit(1) if the pattern cannot be compiled.
 */
static int contains_mail(const char *buffer, int length)
{
    const char *reg_exp =
        "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z0-9._%+-]+";

    regex_t regex;
    regmatch_t match;
    int count = 0;

    (void)length;

    /* regcomp() reports failure with a non-zero error code, which need not
     * be negative, so test against zero rather than "< 0". */
    if (regcomp(&regex, reg_exp, REG_EXTENDED) != 0) {
        fprintf(stderr, "Could not compile regex\n");
        exit(1);
    }

    /* Each call finds at most one match; offsets are relative to the
     * pointer passed in, so advance past the match and search again. */
    while (regexec(&regex, buffer, 1, &match, 0) == 0) {
        int start = (int)match.rm_so;
        int end = (int)match.rm_eo;

        printf("%.*s\n", end - start, buffer + start);
        count++;
        buffer += end;
    }

    /* Release the memory regcomp() allocated for the compiled pattern. */
    regfree(&regex);

    return count;
}
于 2014-05-07T14:29:35.357 回答