我有一个索引存储,其中包含各种短语,但其中一些存在拼写错误,例如:
"Hallo World"
"Qick Brown Fox"
"Long tame no sees"
我正在使用布尔查询运行模糊搜索,但当出现匹配项时,我不知道如何找出是我的哪一个搜索短语产生了匹配。索引存储中的短语很容易取得,但我想要的不只是它。
例如,如果我的搜索是由(“Hello World”或“Keep it Real”或“Long Time no See”)组成的布尔查询,它将返回“Hallo World”和“Long tame no sees”。但我还需要知道,匹配上的是我原始短语中的“Hello World”和“Long Time no See”。有什么办法吗?我的搜索方法如下(目前返回的是索引中匹配到的、带拼写错误的短语):
/// <summary>
/// Runs a fuzzy search for the supplied phrases against the Lucene index stored at
/// <paramref name="path"/> and returns the matching indexed documents.
/// </summary>
/// <remarks>
/// Each whitespace-separated word of a phrase is turned into a fuzzy term (trailing '~').
/// The words of one phrase are combined with MUST (AND); the per-phrase queries are then
/// combined with SHOULD (OR). At most 100 top-scoring hits are collected.
/// The returned documents carry the indexed field values only; they do not record which
/// input phrase produced the hit.
/// </remarks>
/// <param name="fieldsToSearch">
/// Names of the fields to search. When null, the field names the reader reports for
/// <c>IndexReader.FieldOption.INDEXED_NO_TERMVECTOR</c> are used instead.
/// </param>
/// <param name="phrases">Phrases to search for. Null is treated as an empty list.</param>
/// <param name="path">File-system path of the Lucene index directory.</param>
/// <returns>
/// The wrapped hit documents; <c>null</c> when the directory is invalid or contains no index
/// files; an empty or partial list when the search itself fails (the failure is logged).
/// </returns>
public IEnumerable<DocumentWrapper> Search(List<string> fieldsToSearch, List<string> phrases, string path)
{
    // A null phrase list used to fail at the logging call below, outside any try block.
    var phraseList = phrases ?? new List<string>();

    // The directory, reader, analyzer and searcher all hold resources (file handles),
    // so every one of them is disposed on all exit paths.
    using (var luceneDir = FSDirectory.Open(new DirectoryInfo(path)))
    {
        if (luceneDir == null)
        {
            Logging.Error("Invalid Index Directory during Search");
            return null;
        }

        Logging.Information(string.Format("Searching for phrase(s): {0}", phraseList.ToSeparatedList()));

        IndexReader indexReader;
        try
        {
            indexReader = IndexReader.Open(luceneDir, true);
        }
        catch (FileNotFoundException)
        {
            Logging.Error("Lucene Directory has no items");
            return null;
        }

        var matchedDocuments = new List<DocumentWrapper>();

        using (indexReader)
        using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
        using (var searcher = new IndexSearcher(indexReader))
        {
            try
            {
                MultiFieldQueryParser queryParser;
                if (fieldsToSearch != null)
                {
                    queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fieldsToSearch.ToArray(), analyzer);
                }
                else
                {
                    queryParser = new MultiFieldQueryParser(
                        Version.LUCENE_30,
                        indexReader.GetFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR).ToArray(),
                        analyzer);
                }

                // Combined query shape: (word1~ AND word2~ AND word3~) OR (word4~ AND word5~) OR (word6~)
                var combined = new BooleanQuery();
                foreach (var phrase in phraseList)
                {
                    if (string.IsNullOrWhiteSpace(phrase))
                    {
                        continue;
                    }

                    var terms = phrase.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries);
                    var phraseQuery = new BooleanQuery();
                    var clauseCount = 0;
                    foreach (var term in terms)
                    {
                        // Strip any caller-supplied tilde so exactly one fuzzy operator is appended.
                        var bareTerm = term.Replace("~", "");
                        if (bareTerm.Length == 0)
                        {
                            // A term made only of '~' would become the unparsable query "~"
                            // and abort the whole search.
                            continue;
                        }

                        // 'AND' the words in the phrase together.
                        phraseQuery.Add(queryParser.Parse(bareTerm + "~"), Occur.MUST);
                        clauseCount++;
                    }

                    if (clauseCount > 0)
                    {
                        combined.Add(phraseQuery, Occur.SHOULD);
                    }
                }

                var maxHits = 100; //TODO - configurable
                var collector = TopScoreDocCollector.Create(maxHits, true);
                searcher.Search(combined, collector);

                foreach (var hit in collector.TopDocs().ScoreDocs)
                {
                    var hitDoc = searcher.Doc(hit.Doc);
                    var indexDoc = new DocumentWrapper();
                    foreach (var field in hitDoc.GetFields())
                    {
                        // NOTE(review): if Contents is a dictionary, a document with a repeated
                        // field name will throw on Add - confirm against DocumentWrapper.
                        if (field.Name == "Score")
                        {
                            // A stored field named "Score" is replaced by the hit's relevance score.
                            indexDoc.Contents.Add(field.Name, hit.Score);
                        }
                        else
                        {
                            indexDoc.Contents.Add(field.Name, hitDoc.Get(field.Name));
                        }
                    }
                    matchedDocuments.Add(indexDoc);
                }
            }
            catch (System.Exception ex)
            {
                // Stay best-effort (return what was collected so far), but never hide the failure.
                Logging.Error(string.Format("Search failed: {0}", ex));
            }
        }

        return matchedDocuments;
    }
}