我已经阅读了一些文档,并建立了一个如下所示的 Lucene 索引。
文档:
id 1
keyword foo bar
keyword john
id 2
keyword foo
id 3
keyword john doe
keyword bar foo
keyword what the hell
我希望在查询 Lucene 时,能够把单个词项(term)和短语(phrase)组合在一起。
假设我的查询是
foo bar
应该返回文档 ID 1、2 和 3
查询
"foo bar"
应该返回文档 ID 1
查询
john
应该返回文档 ID 1 和 3
查询
john "foo bar"
应该返回文档 ID 1
我的 Java 实现不起作用,阅读了大量文档也无济于事。
当我查询我的索引时
"foo bar"
我得到 0 条命中结果
当我查询我的索引时
foo "john doe"
我得到了文档 ID 1、2 和 3(我期望只得到文档 ID 3,因为该查询的意思是 foo 并且 "john doe")。问题在于,"john doe" 返回 0 条命中,而 foo 返回 3 条命中。
我的目标是把单个词项和短语组合在一起查询。我究竟做错了什么?我也尝试过更换分析器(Analyzer),但没有成功。
我的实现如下所示:
索引器
import ...
/**
 * Writes {@link TestCaseDescription}s into a Lucene index located in a directory on disk.
 *
 * <p>The underlying {@link IndexWriter} is created lazily on the first call to
 * {@link #index(TestCaseDescription)} and released by {@link #store()}.
 */
public class Indexer
{
    private static final Logger LOG = LoggerFactory.getLogger(Indexer.class);

    private final File indexDir;
    private IndexWriter writer;

    /**
     * @param indexDir directory that holds (or will hold) the index files
     */
    public Indexer(File indexDir)
    {
        this.indexDir = indexDir;
        this.writer = null;
    }

    /**
     * Opens an {@link IndexWriter} on {@link #indexDir} using a {@link StandardAnalyzer} and
     * removes every document that is already in the index.
     *
     * @return the freshly opened writer
     * @throws RuntimeException if the directory or the writer cannot be opened
     */
    private IndexWriter createIndexWriter()
    {
        try
        {
            Directory dir = FSDirectory.open(indexDir);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwc.setRAMBufferSizeMB(256.0);
            IndexWriter idx = new IndexWriter(dir, iwc);
            // Start from an empty index: all previously indexed documents are dropped.
            idx.deleteAll();
            return idx;
        }
        catch (IOException e)
        {
            throw new RuntimeException(
                String.format("Could not create indexer on directory [%s]", indexDir.getAbsolutePath()), e);
        }
    }

    /**
     * Adds (or replaces) the index document for one test case. The document carries the
     * path, the last-modified timestamp, the id and one keyword field value per keyword.
     *
     * @param desc the test case to index
     * @throws RuntimeException if the index cannot be opened or written
     */
    public void index(TestCaseDescription desc)
    {
        if (writer == null)
        {
            writer = createIndexWriter();
        }
        Document doc = new Document();
        addPathToDoc(desc, doc);
        addLastModifiedToDoc(desc, doc);
        addIdToDoc(desc, doc);
        for (String keyword : desc.getKeywords())
        {
            addKeywordToDoc(doc, keyword);
        }
        updateIndex(doc, desc);
    }

    /** Adds the stored, analyzed id field; only document matches are recorded for it. */
    private void addIdToDoc(TestCaseDescription desc, Document doc)
    {
        Field idField = new Field(LuceneConstants.FIELD_ID, desc.getId(), Field.Store.YES, Field.Index.ANALYZED);
        idField.setIndexOptions(IndexOptions.DOCS_ONLY);
        doc.add(idField);
    }

    /**
     * Adds one value of the (multi-valued) keyword field.
     *
     * <p>Term positions MUST be indexed for this field: phrase queries such as
     * {@code "foo bar"} are evaluated against positions, and with
     * {@code IndexOptions.DOCS_ONLY} they never match anything.
     */
    private void addKeywordToDoc(Document doc, String keyword)
    {
        Field keywordField = new Field(LuceneConstants.FIELD_KEYWORDS, keyword, Field.Store.YES, Field.Index.ANALYZED);
        keywordField.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        doc.add(keywordField);
    }

    /** Adds the last-modified timestamp as a numeric (long) field. */
    private void addLastModifiedToDoc(TestCaseDescription desc, Document doc)
    {
        NumericField modifiedField = new NumericField(LuceneConstants.FIELD_LAST_MODIFIED);
        modifiedField.setLongValue(desc.getLastModified());
        doc.add(modifiedField);
    }

    /**
     * Adds the path as a stored, un-analyzed field so that it can be used as an exact-match
     * key by {@link #updateIndex(Document, TestCaseDescription)}.
     */
    private void addPathToDoc(TestCaseDescription desc, Document doc)
    {
        Field pathField = new Field(LuceneConstants.FIELD_PATH, desc.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
        doc.add(pathField);
    }

    /**
     * Writes the document to the index: a plain add when the writer was opened in
     * {@code CREATE} mode, otherwise a replace keyed on the exact path.
     *
     * @throws RuntimeException if the writer reports an I/O error
     */
    private void updateIndex(Document doc, TestCaseDescription desc)
    {
        try
        {
            if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
            {
                // New index, so we just add the document (no old document can be there):
                LOG.debug(String.format("Adding testcase [%s] (%s)", desc.getId(), desc.getPath()));
                writer.addDocument(doc);
            }
            else
            {
                // Existing index (an old copy of this document may have been indexed) so
                // we use updateDocument instead to replace the old one matching the exact
                // path, if present:
                LOG.debug(String.format("Updating testcase [%s] (%s)", desc.getId(), desc.getPath()));
                writer.updateDocument(new Term(LuceneConstants.FIELD_PATH, desc.getPath()), doc);
            }
        }
        catch (IOException e)
        {
            throw new RuntimeException(String.format("Could not create or update index for testcase [%s] (%s)", desc.getId(),
                desc.getPath()), e);
        }
    }

    /**
     * Closes the writer so that all pending changes are written to the index directory.
     * Does nothing when no writer is open (nothing was indexed since the last call).
     *
     * @throws RuntimeException if closing the writer fails; the original
     *         {@link IOException} is attached as the cause
     */
    public void store()
    {
        if (writer == null)
        {
            return;
        }
        try
        {
            writer.close();
        }
        catch (IOException e)
        {
            throw new RuntimeException(String.format("Could not write index [%s]", writer.getDirectory().toString()), e);
        }
        writer = null;
    }
}
搜索器(Searcher):
import ...
/**
 * Runs query-parser queries against the keyword field of the index stored in a directory
 * and returns the ids of the matching test cases.
 *
 * <p>NOTE(review): a single {@link QueryParser} instance is shared by all calls to
 * {@link #search(String)}; Lucene documents QueryParser as not thread-safe, so confirm that
 * this class is only used from one thread at a time.
 */
public class Searcher
{
    private static final Logger LOG = LoggerFactory.getLogger(Searcher.class);

    private final Analyzer analyzer;
    private final QueryParser parser;
    private final File indexDir;

    /**
     * @param indexDir directory that holds the index files
     */
    public Searcher(File indexDir)
    {
        this.indexDir = indexDir;
        analyzer = new StandardAnalyzer(Version.LUCENE_34);
        parser = new QueryParser(Version.LUCENE_34, LuceneConstants.FIELD_KEYWORDS, analyzer);
        parser.setAllowLeadingWildcard(true);
    }

    /**
     * Parses {@code searchString} with the query parser (default field: keywords), runs it
     * against the index and collects the stored id of every hit.
     *
     * @param searchString query in Lucene query-parser syntax
     * @return the ids of all matching documents, in the order the collector reports them
     * @throws RuntimeException if the query cannot be parsed or the index cannot be read
     */
    public List<String> search(String searchString)
    {
        List<String> testCaseIds = new ArrayList<String>();
        try
        {
            IndexSearcher searcher = getIndexSearcher(indexDir);
            try
            {
                Query query = parser.parse(searchString);
                LOG.info("Searching for: {}", query.toString(parser.getField()));

                AllDocCollector results = new AllDocCollector();
                searcher.search(query, results);
                LOG.info("Found [{}] hit", results.getHits().size());

                for (ScoreDoc scoreDoc : results.getHits())
                {
                    Document doc = searcher.doc(scoreDoc.doc);
                    testCaseIds.add(doc.get(LuceneConstants.FIELD_ID));
                }
            }
            finally
            {
                // Release the searcher even when parsing or searching fails.
                searcher.close();
            }
            return testCaseIds;
        }
        catch (Exception e)
        {
            throw new RuntimeException(String.format("Could not search index [%s]", indexDir.getAbsolutePath()), e);
        }
    }

    /**
     * Opens an {@link IndexSearcher} on the given index directory.
     *
     * @throws RuntimeException if the directory cannot be opened; the failure is also logged
     */
    private IndexSearcher getIndexSearcher(File indexDir)
    {
        try
        {
            FSDirectory dir = FSDirectory.open(indexDir);
            return new IndexSearcher(dir);
        }
        catch (IOException e)
        {
            LOG.error(String.format("Could not open index directory [%s]", indexDir.getAbsolutePath()), e);
            throw new RuntimeException(e);
        }
    }
}