2

我有以下程序:

/**
 * Small Lucene demo: indexes a fixed list of strings and then runs a
 * {@link RegexQuery} against them, printing every stored value that matches.
 *
 * <p>Each sample string becomes one document with two fields:
 * <ul>
 *   <li>{@code type} - {@code "currency"} when the string contains a dollar
 *       sign, {@code "simple_field"} otherwise;</li>
 *   <li>{@code term} - the string itself.</li>
 * </ul>
 * Both fields are added with {@code Field.Index.NOT_ANALYZED}, i.e. the value is
 * indexed as one un-tokenized term, so the regular expression is evaluated
 * against the complete original string (including any {@code $}).
 */
public class RegexQueryExample {

    /** Sample strings that are indexed; also used as the upper bound on returned hits. */
    public static String[] terms = {
        "US $65M dollars",
        "USA",
        "$35",
        "355",
        "US $33",
        "U.S.A",
        "John Keates",
        "Tom Dick Harry",
        "Southeast' Asia"
    };

    /** Index storage; assigned a fresh {@link RAMDirectory} by {@link #createIndex()}. */
    private static Directory directory;

    /**
     * Builds the index and prints every indexed string that contains a dollar sign.
     *
     * @param args unused
     * @throws CorruptIndexException propagated from index creation or search
     * @throws IOException propagated from index creation or search
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        // Java-regex for "anything, a literal dollar sign, anything".
        String searchString = ".*\\$.*";
        createIndex();
        searchRegexIndex(searchString);
    }

    /**
     * Creates the index in a new {@link RAMDirectory}, adding one document per
     * entry of {@link #terms}.
     *
     * @throws CorruptIndexException propagated from the {@link IndexWriter}
     * @throws LockObtainFailedException propagated from the {@link IndexWriter}
     * @throws IOException propagated from the {@link IndexWriter}
     */
    private static void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        directory = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);

        try {
            for (String term : terms) {
                // Strings containing '$' are tagged "currency"; everything else "simple_field".
                String type = term.indexOf('$') >= 0 ? "currency" : "simple_field";

                Document document = new Document();
                document.add(new Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new Field("term", term, Field.Store.YES, Field.Index.NOT_ANALYZED));
                indexWriter.addDocument(document);
            }
        } finally {
            // Close the writer even if adding a document fails.
            indexWriter.close();
        }
    }

    /**
     * Searches the {@code term} field with a regular expression, restricted to
     * documents whose {@code type} is {@code "currency"}, and prints the stored
     * {@code term} value of every hit as a numbered list on standard output.
     *
     * @param regexString the regular expression to match against the {@code term} field
     * @throws CorruptIndexException propagated from the {@link IndexSearcher}
     * @throws IOException propagated from the {@link IndexSearcher}
     */
    private static void searchRegexIndex(String regexString) throws CorruptIndexException, IOException {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            BooleanQuery query = new BooleanQuery();
            // createIndex() labels every '$'-bearing string as "currency". Requiring
            // "simple_field" here (as the code previously did) excludes exactly those
            // documents, so a '$' regex could never return a hit.
            query.add(new TermQuery(new Term("type", "currency")), BooleanClause.Occur.MUST);
            query.add(new RegexQuery(new Term("term", regexString)), BooleanClause.Occur.MUST);

            // At most one hit per indexed string is possible.
            TopDocs hits = searcher.search(query, terms.length);
            ScoreDoc[] alldocs = hits.scoreDocs;
            for (int i = 0; i < alldocs.length; i++) {
                Document d = searcher.doc(alldocs[i].doc);
                System.out.println((i + 1) + ". " + d.get("term"));
            }
        } finally {
            // Release the searcher's underlying index reader.
            searcher.close();
        }
    }
}

createIndex() 函数创建 Lucene 索引,searchRegexIndex() 函数执行正则表达式查询。在 main() 函数中,我搜索 .*\\$.*,期望它返回包含 $ 符号的词条。但是,它没有起作用。我该如何让它工作?这是分析器的问题吗?

编辑:

这是我在 Luke 中看到的 Lucene 索引快照:

Lucene 索引

4

1 回答 1

4

您正在使用 StandardAnalyzer,它会从词元(token)中删除美元符号。例如,"US $65M dollars" 会变成三个词元:"us"、"65m"、"dollars"。您需要使用另一个不会删除美元符号的分析器。Luke 提供了一个出色的分析器工具,您可以在其中尝试不同的分析器并检查它们的输出。

于 2012-07-03T20:30:27.400 回答