我需要扩展 Aho-Corasick 算法以匹配部分字典条目。为此,我在 JS 中修改了一个实现并进行了一些测试,但它还不能很好地工作
这是我的代码:
var AhoCorasick = function (keywords) {
this._buildTables(keywords);
};
AhoCorasick.prototype._buildTables = function (keywords) {
var gotoFn = {
0: {},
};
var output = {};
var state = 0;
keywords.forEach(function (word) {
var curr = 0;
for (var i = 0; i < word.length; i++) {
var l = word[i];
if (gotoFn[curr] && l in gotoFn[curr]) {
curr = gotoFn[curr][l];
} else {
state++;
gotoFn[curr][l] = state;
gotoFn[state] = {};
curr = state;
output[state] = [];
}
}
output[curr].push(word);
});
var failure = {};
var xs = [];
// f(s) = 0 for all states of depth 1 (the ones from which the 0 state can transition to)
for (var l in gotoFn[0]) {
var state = gotoFn[0][l];
failure[state] = 0;
xs.push(state);
}
while (xs.length) {
var r = xs.shift();
// for each symbol a such that g(r, a) = s
for (var l in gotoFn[r]) {
var s = gotoFn[r][l];
xs.push(s);
// set state = f(r)
var state = failure[r];
while (state > 0 && !(l in gotoFn[state])) {
state = failure[state];
}
if (l in gotoFn[state]) {
var fs = gotoFn[state][l];
failure[s] = fs;
output[s] = output[s].concat(output[fs]);
} else {
failure[s] = 0;
}
}
}
this.gotoFn = gotoFn;
this.output = output;
this.failure = failure;
};
AhoCorasick.prototype.search = function (string) {
//console.log(this.gotoFn);
var noErrorString = '';
var state = 0;
var results = [];
var h = 0;
var init = false;
aho_loop: for (var i = 0; i < string.length; i++) {
var l = string[i];
while (state > 0 && !(l in this.gotoFn[state])) {
state = this.failure[state];
noErrorString = '';
}
if (!(l in this.gotoFn[state])) {
if (h != 0 || !init) {
h++;
init = true;
var results2 = [];
for (key in this.gotoFn) {
for (subkey in this.gotoFn[key]) {
if (subkey == l) {
results2.push(this.gotoFn[key][l]);
}
}
}
}
console.log(l);
console.log(results2);
while (h < results2.length) {
state = results2[h];
noErrorString += l;
h++;
continue aho_loop;
}
h = 0;
init = false;
continue;
}
state = this.gotoFn[state][l];
noErrorString += l;
h = 0;
init = false;
if ((noErrorString.match(/ /g) || []).length > 1) {
var foundStrs = noErrorString;
if (string !== undefined) {
if (i + 1 < string.length) {
if (!string[i + 1].match(/\w+/g)) {
if (typeof string[i - foundStrs.length] === 'undefined') {
results.push([i, foundStrs]); // + 1
} else if (!string[i - foundStrs.length].match(/\w+/g)) {
results.push([i, foundStrs]); // + 1
}
}
} else {
results.push([i, foundStrs]);
}
}
}
}
return results;
};
var ac = new AhoCorasick([
'brownie t test keyword1 hello stop',
'lele brown',
'etc',
]);
var results = ac.search('test keyword1');
当我在字典中输入这个条目时:
布朗尼 t 测试关键字 1 你好停止
结果是:
"test keyword1 "
所以很好,但如果我输入这个:
布朗尼测试测试关键字1你好停止
结果是:
"st keyword1 "
当我输入这个时:
布朗尼 tt 测试关键字 1 你好停止
结果是空的...
有人可以帮我找到我的错误吗?
谢谢!