1

我想获取 Lucene 4 索引中每个词项(term)的出现次数。网上有一些针对旧版 Lucene 的示例,但众所周知,Lucene 每发布一个新版本几乎都会彻底更改 API,所以那些 how-to 教程对我完全没有帮助。

我现在拥有的:

// Open a reader on the index held by `directory` and ask MultiFields for
// the reader's Fields.
IndexReader indexReader = DirectoryReader.open(directory);
Fields fields = MultiFields.getFields(indexReader);
// Outer loop: one iteration per field name yielded by `fields`.
for (String field : fields) {
    // Fetch the Terms of the current field and start a fresh enumeration
    // (null is passed as the iterator argument).
    Terms terms = fields.terms(field);
    TermsEnum termEnum = terms.iterator(null);
    BytesRef bytesRef;
    // Inner loop: advance through the terms until next() returns null.
    while ((bytesRef = termEnum.next()) != null) {
        // Open question of this snippet — the body is intentionally empty:
        // what to do here? I have a BytesRef and there is 
        // IndexReader#docFreq(term) but it requires a Term
    }
}
4

1 回答 1

4
    // Open the index stored under /tmp/moo. The directory and the reader both
    // keep files open, so each one is closed in a finally block when done.
    FSDirectory directory = FSDirectory.open(new File("/tmp/moo"));
    /*
    Disabled setup code showing how a small test index can be written:

    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
    Document document = new Document();
    document.add(new TextField("foo", "abc", Store.YES));
    document.add(new TextField("foo", "abc", Store.YES));
    document.add(new TextField("foo", "aaa", Store.YES));
    document.add(new TextField("bar", "abc", Store.YES));
    writer.addDocument(document);
    writer.commit();
    writer.close(true);
    */
    try {
        IndexReader indexReader = DirectoryReader.open(directory);
        try {
            Bits liveDocs = MultiFields.getLiveDocs(indexReader);
            Fields fields = MultiFields.getFields(indexReader);

            // MultiFields.getFields may return null when the reader has no
            // postings (e.g. an empty index); there is nothing to print then.
            if (fields != null) {
                // Pass 1: for every term of every field, print how often the
                // term occurs inside each (live) document that contains it.
                for (String field : fields) {
                    TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
                    BytesRef bytesRef;
                    while ((bytesRef = termEnum.next()) != null) {
                        // next() already leaves the enum positioned on bytesRef,
                        // so no extra seekExact() call is needed before docs().
                        DocsEnum docsEnum = termEnum.docs(liveDocs, null);

                        if (docsEnum != null) {
                            int doc;
                            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                System.out.println(bytesRef.utf8ToString() + " in doc " + doc + ": " + docsEnum.freq());
                            }
                        }
                    }
                }

                // Pass 2: for every term of every field, print the number of
                // documents containing it. The enum is already positioned on
                // the term, so its own docFreq() is used instead of building a
                // Term and looking it up again through the reader.
                // (termEnum.totalTermFreq() would give the total number of
                // occurrences across all documents, if that is what is needed.)
                for (String field : fields) {
                    TermsEnum termEnum = MultiFields.getTerms(indexReader, field).iterator(null);
                    BytesRef bytesRef;
                    while ((bytesRef = termEnum.next()) != null) {
                        int freq = termEnum.docFreq();

                        System.out.println(bytesRef.utf8ToString() + " in " + freq + " documents");
                    }
                }
            }
        } finally {
            // Release the reader's file handles even if enumeration failed.
            indexReader.close();
        }
    } finally {
        directory.close();
    }
于 2013-10-17T10:31:41.583 回答