我使用以下代码创建(并经常更新)用户索引(为了演示目的,这里稍微缩短了一点):
// Adds or replaces the index document for a single user, keyed by the "UID" term.
Lucene.Net.Store.Directory directory = FSDirectory.Open(new System.IO.DirectoryInfo("TestLuceneIndex"));
StandardAnalyzer standardAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
IndexWriter indexWriter = new IndexWriter(directory, standardAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED);
try
{
    Document doc = new Document();

    // Identifier and filter fields are indexed as single, un-analyzed terms so they can be matched exactly.
    doc.Add(new Field("UID", uid, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("GENDER", gender, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("COUNTRY", countrycode, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("CITY", citycode, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO));

    // Free-text fields are analyzed and keep term vectors with positions and offsets.
    doc.Add(new Field("USERDATA", userdata, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("USERINFO", userinfo, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

    // Replaces any existing document carrying the same UID term with the new one.
    indexWriter.UpdateDocument(new Term("UID", uid), doc);
    indexWriter.Optimize();
    indexWriter.Commit();
}
finally
{
    // Close the writer even when a step above throws, so it is never left open on the index directory.
    indexWriter.Close();
}
存储在索引中的值如下:
UID - 用户 ID(字符串 GUID);GENDER - 性别 ID(字符串“0”(未识别)、“1”(男性)或“2”(女性));COUNTRY - 国家代码(字符串,例如“US”、“FR”等);CITY - 城市代码(字符串,例如“A121”、“C432”等);USERDATA - 用户详细信息的长字符串(类似于“John Doe j.doe@gmail.com 设计师 高等教育 5 年经验”);USERINFO - 关于用户的长字符串(类似于“我的名字是 John Doe。我出生......”)
然后我在索引中执行搜索。我会在两个字段(USERDATA 和 USERINFO)中进行搜索,并在必要时按 GENDER、COUNTRY 和 CITY 过滤结果。作为搜索结果,我取回 UID(我需要用这个值来定位数据库中对应的用户记录)。
这是我用于搜索的代码:
// Builds and runs the user search: a free-text match over USERDATA and USERINFO,
// optionally narrowed by exact GENDER, CITY and COUNTRY terms, then sets up a highlighter for the same query.
Lucene.Net.Store.Directory directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo("TestLuceneIndex"));
standardAnalyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
Lucene.Net.Index.IndexReader indexReader = Lucene.Net.Index.IndexReader.Open(directory, true);
indexSearcher = new Lucene.Net.Search.IndexSearcher(indexReader);

Lucene.Net.Search.BooleanQuery booleanQuery = new Lucene.Net.Search.BooleanQuery();

// The search text is parsed against both free-text fields and is a required clause.
Lucene.Net.QueryParsers.MultiFieldQueryParser queryTextParser = new Lucene.Net.QueryParsers.MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_29, new string[] { "USERDATA", "USERINFO" }, standardAnalyzer);
Lucene.Net.Search.Query queryText = queryTextParser.Parse(SearchText);
booleanQuery.Add(queryText, Lucene.Net.Search.BooleanClause.Occur.MUST);

// Each filter value of "0" means "do not filter on this field"; any other value becomes a required exact-term clause.
string[][] exactFilters = new string[][]
{
    new string[] { "GENDER", searchGender },
    new string[] { "CITY", searchCity },
    new string[] { "COUNTRY", searchCountry }
};
foreach (string[] exactFilter in exactFilters)
{
    if (exactFilter[1] != "0")
    {
        Lucene.Net.Index.Term filterTerm = new Lucene.Net.Index.Term(exactFilter[0], exactFilter[1]);
        Lucene.Net.Search.Query filterQuery = new Lucene.Net.Search.TermQuery(filterTerm);
        booleanQuery.Add(filterQuery, Lucene.Net.Search.BooleanClause.Occur.MUST);
    }
}

// The collector is sized to the reader's MaxDoc() so that no hit is cut off.
Lucene.Net.Search.TopScoreDocCollector collector = Lucene.Net.Search.TopScoreDocCollector.create(indexReader.MaxDoc(), true);
indexSearcher.Search(booleanQuery, collector);
Lucene.Net.Search.ScoreDoc[] scoreDocs = collector.TopDocs().scoreDocs;

// Highlighter wraps matches of the full query in <b>…</b>, using fragments of 150 characters.
Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter("<b>", "</b>");
Lucene.Net.Highlight.QueryScorer queryScorer = new Lucene.Net.Highlight.QueryScorer(booleanQuery);
highlighter = new Lucene.Net.Highlight.Highlighter(formatter, queryScorer);
Lucene.Net.Highlight.Fragmenter fragmenter = new Lucene.Net.Highlight.SimpleFragmenter(150);
highlighter.SetTextFragmenter(fragmenter);
除了使用多个单词搜索时的相关性之外,一切都运行良好:例如,当我搜索(microsoft .net 程序员)时,包含这个完整短语的结果,其得分并不比这些单词分散在文本不同位置的结果更高。我理解,这是因为分数是根据搜索词在文本中所占的比例计算的,而不是根据与搜索字符串匹配的精确程度。但是,如何让评分算法更看重匹配的精确程度呢?也就是说,如何在计算相关性时,让命中的单词之间的距离(邻近度)占更大的权重?