I've just updated my codebase from Lucene 3.6 to Lucene 4.1, and the test where I use a NormalizeCharMap to replace characters in the analyzer no longer seems to work.
Below is a standalone test case I created, and this is the output when I run it:
--term=and--
--term=gold--
--term=platinum--
name:"platinum and gold"
Size1
name:"platinum & gold"
Size0
java.lang.AssertionError:
Expected :1
Actual :0
at org.junit.Assert.fail(Assert.java:93)
at org.junit.Assert.failNotEquals(Assert.java:647)
at org.junit.Assert.assertEquals(Assert.java:128)
at org.junit.Assert.assertEquals(Assert.java:472)
at org.junit.Assert.assertEquals(Assert.java:456)
at org.musicbrainz.search.analysis.Lucene41CharFilterTest.
testAmpersandSearching(Lucene41CharFilterTest.java:89)
As you can see, the char filter does seem to work, because the text "platinum & gold" is converted into the three terms "and", "gold" and "platinum". And in fact searching works for "platinum and gold", but not for the original "platinum & gold", even though both indexing and searching use the same analyzer.
package org.musicbrainz.search.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.Reader;

import static org.junit.Assert.assertEquals;

public class Lucene41CharFilterTest {

    class SimpleAnalyzer extends Analyzer {

        protected NormalizeCharMap charConvertMap;

        protected void setCharConvertMap() {
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.add("&", "and");
            charConvertMap = builder.build();
        }

        public SimpleAnalyzer() {
            setCharConvertMap();
        }

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new MusicbrainzTokenizer(Version.LUCENE_41,
                    new MappingCharFilter(charConvertMap, reader));
            TokenStream filter = new LowerCaseFilter(Version.LUCENE_41, source);
            return new TokenStreamComponents(source, filter);
        }
    }

    @Test
    public void testAmpersandSearching() throws Exception {
        Analyzer analyzer = new SimpleAnalyzer();
        RAMDirectory dir = new RAMDirectory();
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_41, analyzer);
        IndexWriter writer = new IndexWriter(dir, writerConfig);
        {
            Document doc = new Document();
            doc.add(new Field("name", "platinum & gold", Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();

        IndexReader ir = DirectoryReader.open(dir);
        Fields fields = MultiFields.getFields(ir);
        Terms terms = fields.terms("name");
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            System.out.println("--term=" + text.utf8ToString() + "--");
        }
        ir.close();

        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        {
            Query q = new QueryParser(Version.LUCENE_41, "name", analyzer)
                    .parse("\"platinum and gold\"");
            System.out.println(q);
            TopDocs td = searcher.search(q, 10);
            System.out.println("Size" + td.scoreDocs.length);
            assertEquals(1, searcher.search(q, 10).totalHits);
        }

        searcher = new IndexSearcher(IndexReader.open(dir));
        {
            Query q = new QueryParser(Version.LUCENE_41, "name", analyzer)
                    .parse("\"platinum & gold\"");
            System.out.println(q);
            TopDocs td = searcher.search(q, 10);
            System.out.println("Size" + td.scoreDocs.length);
            assertEquals(1, searcher.search(q, 10).totalHits);
        }
    }
}
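For reference, here is a minimal standalone sketch (the PrintAnalyzedTerms class and its main method are hypothetical, not part of the test above) that runs "platinum & gold" directly through the same SimpleAnalyzer, bypassing the index and the QueryParser, to see which terms come out of the analysis chain on its own:

package org.musicbrainz.search.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class PrintAnalyzedTerms {

    public static void main(String[] args) throws Exception {
        // SimpleAnalyzer is a non-static inner class of the test,
        // so instantiate it via an instance of the enclosing class.
        Analyzer analyzer = new Lucene41CharFilterTest().new SimpleAnalyzer();

        // Build the analysis chain for the "name" field and feed the raw text through it.
        TokenStream ts = analyzer.tokenStream("name", new StringReader("platinum & gold"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("--analyzed term=" + term.toString() + "--");
        }
        ts.end();
        ts.close();
    }
}

This should make it easier to tell whether the problem lies in the analysis chain itself or in how the phrase query is analyzed at search time.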