2

我有一个索引存储,其中包含各种短语,但有些拼写错误,例如:-

"Hallo World"
"Qick Brown Fox"
"Long tame no sees"

我正在使用布尔查询运行模糊搜索,但当出现匹配项时,我不知道如何得知是我搜索的哪一个短语匹配上了。索引存储中被匹配到的短语很容易获得,但我想要的不只是这些。

例如,如果我的搜索是(“Hello World”或“Keep it Real”或“Long Time no See”)的布尔查询,它将返回“Hallo World”和“Long tame no sees”。但是我还需要知道,是我原来的搜索短语“Hello World”和“Long Time no See”产生了这些匹配。有任何想法吗?我的搜索方法如下(目前只返回索引中匹配到的、带拼写错误的短语):-

/// <summary>
/// Runs a fuzzy search for the given phrases against the Lucene index stored at <paramref name="path"/>.
/// Each word of a phrase is turned into a fuzzy term (trailing '~'); the words of one phrase are
/// combined with AND, and the phrases are combined with OR.
/// </summary>
/// <param name="fieldsToSearch">
/// Names of the fields to search. When null, the field names reported by the index reader for
/// <c>IndexReader.FieldOption.INDEXED_NO_TERMVECTOR</c> are used instead.
/// </param>
/// <param name="phrases">Phrases to search for; each is split into words on spaces.</param>
/// <param name="path">File-system path of the Lucene index directory.</param>
/// <returns>
/// One <c>DocumentWrapper</c> per hit (at most 100), or null when the index directory cannot be opened.
/// If the search itself fails, the error is logged and the documents collected so far are returned.
/// </returns>
public IEnumerable<DocumentWrapper> Search(List<string> fieldsToSearch, List<string> phrases, string path)
{
    // Every Lucene object opened here holds file handles, so each one is disposed on all exit paths.
    using (var luceneDir = FSDirectory.Open(new DirectoryInfo(path)))
    {
        if (luceneDir == null)
        {
            Logging.Error("Invalid Index Directory during Search");
            return null;
        }
        Logging.Information(string.Format("Searching for phrase(s): {0}", phrases.ToSeparatedList()));

        IndexReader indexReader;
        try
        {
            indexReader = IndexReader.Open(luceneDir, true);
        }
        catch (FileNotFoundException)
        {
            Logging.Error("Lucene Directory has no items");
            return null;
        }

        var matchedDocuments = new List<DocumentWrapper>();
        using (indexReader)
        using (var analyzer = new StandardAnalyzer(Version.LUCENE_30))
        {
            try
            {
                using (var searcher = new IndexSearcher(indexReader))
                {
                    var fields = fieldsToSearch != null
                                     ? fieldsToSearch.ToArray()
                                     : indexReader.GetFieldNames(
                                         IndexReader.FieldOption.INDEXED_NO_TERMVECTOR).ToArray();
                    var queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);

                    // (word1 AND word2 AND word3) OR (word4 AND word5) OR (word6)
                    var combined = new BooleanQuery();
                    foreach (var phrase in phrases)
                    {
                        var terms = phrase.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries);
                        var booleanQuery = new BooleanQuery();
                        foreach (var term in terms)
                        {
                            // Strip any user-supplied tilde, escape remaining query syntax so a word such as
                            // "foo:bar" cannot break the parser, then append '~' to make the term fuzzy.
                            var fuzzyTerm = QueryParser.Escape(term.Replace("~", "")) + "~";
                            booleanQuery.Add(queryParser.Parse(fuzzyTerm), Occur.MUST);
                        }
                        combined.Add(booleanQuery, Occur.SHOULD);
                    }

                    var maxHits = 100; //TODO - configurable
                    var collector = TopScoreDocCollector.Create(maxHits, true);
                    searcher.Search(combined, collector);
                    var hits = collector.TopDocs().ScoreDocs;

                    foreach (var t in hits)
                    {
                        var hitDoc = searcher.Doc(t.Doc);
                        var indexDoc = new DocumentWrapper();
                        foreach (var field in hitDoc.GetFields())
                        {
                            if (field.Name == "Score")
                            {
                                // A stored field named "Score" is replaced by the relevance score of this hit.
                                indexDoc.Contents.Add(field.Name, t.Score);
                            }
                            else
                            {
                                indexDoc.Contents.Add(field.Name, hitDoc.Get(field.Name));
                            }
                        }
                        matchedDocuments.Add(indexDoc);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep whatever was collected, but record the failure instead of hiding it.
                Logging.Error(string.Format("Search failed: {0}", ex));
            }
        }

        return matchedDocuments;
    }
}
4

0 回答 0