1

我想知道在 Lucene(或其他任何搜索引擎)中,对词项进行分词/索引的最佳方法是什么,才能让下面这些搜索匹配到对应的词项。

“12”=“十二”

“mx1” = “mx one”(即 “mx 一”)

有没有我忽略的内置功能?

4

2 回答 2

1

你看过 Lucene SynonymFilter吗?

于 2012-03-09T23:04:19.863 回答
1

在 Lucene 中,最简单的方法是创建 2 个独立的标记过滤器(token filter),在初始字符串完成分词之后使用。第一个过滤器需要在数字序列和非数字序列的边界处把标记拆开(例如 “mx1” → “mx”、“1”)。第二个过滤器再把纯数字字符串转换成拼写出来的数字单词(例如 “12” → “twelve”)。

这是 PyLucene 的一个示例(不包括偏移和位置属性逻辑):

class AlphaNumberBoundaryFilter(lucene.PythonTokenFilter):
    """Token filter that splits tokens at digit / non-digit boundaries.

    Every incoming token is cut into maximal runs of digits and maximal
    runs of non-digits, e.g. ``"mx1"`` becomes ``"mx"`` followed by ``"1"``.
    Tokens that are already all-digit or all-non-digit pass through
    unchanged; an empty token produces no output.

    The whole input stream is consumed eagerly in ``__init__``; offset and
    position attributes are not maintained by this filter.
    """

    # One maximal run of digits OR one maximal run of non-digits.
    seq = re.compile(r"((?:\d+)|(?:\D+))")

    def __init__(self, in_stream):
        """Drain *in_stream*, split its tokens and prepare them for replay.

        in_stream: the upstream token stream; it is fully consumed here.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        # NOTE(review): reading the upstream term through an attribute added
        # on this filter presumably relies on the filter sharing attribute
        # state with its input stream -- confirm against the PyLucene version
        # in use.
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Get tokens.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Filter tokens.
        self.tokens = self.filter(tokens)
        # Setup iterator.
        self.iter = iter(self.tokens)

    def filter(self, tokens):
        """Return a flat list of the digit / non-digit runs of each token.

        tokens: iterable of token strings.
        The relative order of tokens, and of the runs inside each token, is
        preserved.
        """
        seq = self.seq
        return [split for token in tokens for split in seq.findall(token)]

    def incrementToken(self):
        """Emit the next split token.

        Returns True after writing the next token into the term attribute,
        or False once every buffered token has been emitted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True


class NumberToWordFilter(lucene.PythonTokenFilter):
    """Token filter that replaces all-digit tokens with English number words.

    A token consisting only of digits is replaced by one token per spelled
    word, e.g. ``"12"`` -> ``"twelve"`` and ``"123"`` -> ``"one"``,
    ``"hundred"``, ``"twenty"``, ``"three"``. Any other token passes through
    unchanged.

    Scales above a thousand are built only from ``"thousand"`` and
    ``"million"`` (10**9 is spelled ``"one thousand million"``).

    The whole input stream is consumed eagerly in ``__init__``; offset and
    position attributes are not maintained by this filter.
    """

    # Words for every value the spelling algorithm looks up directly.
    num_map = {
        0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five",
        6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten",
        11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
        15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen",
        19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty",
        50: "fifty", 60: "sixty", 70: "seventy", 80: "eighty",
        90: "ninety", 100: "hundred", 1000: "thousand",
        1000000: "million",
    }
    # A token made up of digits only.
    is_num = re.compile(r"^\d+$")

    def __init__(self, in_stream):
        """Drain *in_stream*, convert its tokens and prepare them for replay.

        in_stream: the upstream token stream; it is fully consumed here.
        """
        lucene.PythonTokenFilter.__init__(self, in_stream)
        # NOTE(review): reading the upstream term through an attribute added
        # on this filter presumably relies on the filter sharing attribute
        # state with its input stream -- confirm against the PyLucene version
        # in use.
        term = self.term = self.addAttribute(lucene.TermAttribute.class_)
        # Get tokens.
        tokens = []
        while in_stream.incrementToken():
            tokens.append(term.term())
        # Filter tokens.
        self.tokens = self.filter(tokens)
        # Setup iterator.
        self.iter = iter(self.tokens)

    def _group_words(self, oth):
        """Spell one group of up to three digits.

        oth: the group's digits in reversed order, so that index 0 is the
        ones digit, 1 the tens digit and 2 the hundreds digit.
        Returns the list of words for the group; empty when the group is
        all zeros.
        """
        num_map = self.num_map
        words = []
        size = len(oth)
        if size == 3 and oth[2] != '0':
            # Hundreds digit present: "<digit> hundred".
            words.append(num_map[int(oth[2])])
            words.append(num_map[100])
        if size >= 2:
            if oth[1] == '1':
                # 10-19 have dedicated words; oth[1::-1] puts the tens and
                # ones digits back into reading order.
                words.append(num_map[int(oth[1::-1])])
            else:
                if oth[1] != '0':
                    # Tens digit >= 2: "twenty", "thirty", ...
                    words.append(num_map[int(oth[1]) * 10])
                if oth[0] != '0':
                    words.append(num_map[int(oth[0])])
        elif oth[0] != '0':
            # Single-digit group.
            words.append(num_map[int(oth[0])])
        return words

    def filter(self, tokens):
        """Return *tokens* with every all-digit token spelled out in words.

        tokens: iterable of token strings.
        Returns a new list; non-numeric tokens are copied unchanged, numeric
        tokens are replaced by their words (leading zeros are ignored, and a
        token of only zeros becomes ``"zero"``).
        """
        num_map = self.num_map
        is_num = self.is_num
        final = []
        for token in tokens:
            if not is_num.match(token):
                final.append(token)
                continue
            # Drop leading zeros, then reverse so index 0 is the ones digit.
            digits = token.lstrip('0')[::-1]
            if not digits:
                # We have a zero.
                final.append(num_map[0])
                continue
            # Group every 3 digits and iterate over digit groups in reverse
            # so that groups are yielded in the original order and in each
            # group: 0 -> ones, 1 -> tens, 2 -> hundreds
            groups = [digits[i:i+3] for i in range(0, len(digits), 3)][::-1]
            scale = len(groups) - 1
            for oth in groups:
                words = self._group_words(oth)
                # Only a non-zero group carries a scale word; otherwise
                # "1000000" would come out as "one million thousand".
                if words:
                    final.extend(words)
                    # scale counts powers of 1000: an odd scale contributes
                    # one "thousand", every pair of powers one "million".
                    if scale % 2:
                        final.append(num_map[1000])
                    final.extend([num_map[1000000]] * (scale // 2))
                scale -= 1
        return final

    def incrementToken(self):
        """Emit the next converted token.

        Returns True after writing the next token into the term attribute,
        or False once every buffered token has been emitted.
        """
        try:
            self.term.setTermBuffer(next(self.iter))
        except StopIteration:
            return False
        return True
于 2012-03-23T04:39:31.343 回答