我正在尝试将一组来自 Adblock Plus 规则的正则表达式转换为可以从 C++ 调用的优化函数。
我期望能够使用诸如 Ragel 之类的词法分析器生成器来执行此操作,但是当我尝试使用非常小的一组正则表达式时,内存使用量变得非常高 > 30 GB,并且 Ragel 退出时没有错误消息并且没有生成输出文件。
我在下面包含了玩具语法,我试图了解我是否在做任何可以优化以解决问题的愚蠢行为。
#include <string.h>
namespace xab{
%%{
machine lexer;
WILDCARD = /[A-Za-z0-9;\/\?:@=&$_\.\+!\*'~#^,%:\-]/*;
SUBDOMAIN = /([A-Za-z]([A-Za-z0-9\-]*[A-Za-z0-9])?\.)+/;
SEPERATOR = /[:\/\?=&]/;
main :=
(WILDCARD '&prvtof=' WILDCARD '&poru=' WILDCARD) |
(WILDCARD '.a3s?n=' WILDCARD '&zone_id=' WILDCARD) |
(WILDCARD '/addyn|' WILDCARD ';adtech;' WILDCARD) |
(WILDCARD '/addyn|' WILDCARD '|adtech;' WILDCARD) |
(WILDCARD '/adiframe|' WILDCARD '|adtech;' WILDCARD) |
(WILDCARD '/adserv|' WILDCARD '|adtech;' WILDCARD) |
(WILDCARD '/affiliates.' WILDCARD '.aspx?' WILDCARD) |
(WILDCARD '/affiliates/' WILDCARD '/show_banner.' WILDCARD) |
(WILDCARD '/banner_js.' WILDCARD '?' WILDCARD) |
(WILDCARD '/bannerframe.' WILDCARD '?' WILDCARD) |
(WILDCARD '/banners.' WILDCARD '&iframe=' WILDCARD) |
(WILDCARD '/bannerview.' WILDCARD '?' WILDCARD) |
(WILDCARD '/bannery/' WILDCARD '?banner=' WILDCARD) |
(WILDCARD '/cdn-cgi/pe/bag?r[]=' WILDCARD 'cpalead.com' WILDCARD) |
(WILDCARD '/delivery/' WILDCARD '?advplaces=' WILDCARD) |
(WILDCARD '/eas?camp=' WILDCARD ';cre=' WILDCARD) |
(WILDCARD '/eas?cu=' WILDCARD ';cre=' WILDCARD) |
(WILDCARD '/eas?cu=' WILDCARD ';ord=' WILDCARD) |
(WILDCARD '/ireel/ad' WILDCARD '.jpg' WILDCARD) |
(WILDCARD '/is.php?ipua_id=' WILDCARD '&search_id=' WILDCARD);
write data;
}%%
bool matchBlacklist(const char *data) {
const char *p = data;
const char *pe = data + strlen(data);
int cs;
//write init
%% write init;
// write exec
%% write exec;
if (cs >= lexer_first_final)
return true;
return false;
}
}