solrj - Spring Solr 拼写检查组件：如何匹配最接近的单词

Question

我是 Solr 的新手。我正在使用 spring 3.x 和 Solr 4.6

下面是我的Schema.xml

<schema name="customer_site_address" version="1.5">

<!-- Document fields. Only `id`, `text` and `_version_` are indexed; every other
     field is stored but not indexed, and `text` is indexed but not stored. -->
<fields>
    <field name="id" type="long" indexed="true" stored="true" required="true" multiValued="false" />
    <field name="name" type="string" indexed="false" stored="true" required="true" multiValued="false"/>
    <field name="number" type="string" indexed="false" stored="true" required="true" multiValued="false"/>
    <field name="address" type="text_general" indexed="false" stored="true" required="true" multiValued="false"/>
    <field name="city" type="string" indexed="false" stored="true" multiValued="false"/>
    <field name="state" type="string" indexed="false" stored="true" multiValued="false"/>
    <field name="zipcode" type="string" indexed="false" stored="true" multiValued="false"/>
    <field name="country" type="string" indexed="false" stored="true" multiValued="false"/>
    <field name="latlng" type="string" indexed="false" stored="true" multiValued="false"/>
    <!-- Multi-valued catch-all field of type text_general; it is the destination of
         the copyField directives and the field the spellcheckers are pointed at. -->
    <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
    <field name="_version_" type="long" indexed="true" stored="true"/>
</fields>

<!-- Configure unique key -->
<uniqueKey>id</uniqueKey>
<!-- Copy name, number, address, city, state, zipcode and country into the
     catch-all `text` field (`id` and `latlng` are not copied). -->
<copyField source="name" dest="text"/>
<copyField source="number" dest="text"/>
<copyField source="address" dest="text"/>
<copyField source="city" dest="text"/>
<copyField source="state" dest="text"/>
<copyField source="zipcode" dest="text"/>
<copyField source="country" dest="text"/>

<!-- Field type definitions referenced by the `type` attribute of the fields. -->
<types>
    <!-- Long -->
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0" />
    <!-- String -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
    <!-- Text: a solr.TextField with separate index-time and query-time analyzer
         chains. NOTE(review): the two chains are not symmetric (the index chain
         has the pattern-replace and word-delimiter filters, the query chain has
         the synonym filter instead); confirm that this is intended. -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
        <!-- Configures the analysis done at the index phase -->
        <analyzer type="index">
            <!-- Uses word break rules of the Unicode Text Segmentation algorithm
                when splitting text into words. -->
            <tokenizer class="solr.StandardTokenizerFactory" />
            <!-- <tokenizer class="solr.KeywordTokenizerFactory"/> -->
            <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
            <!-- Removes words found from stopwords.txt file. This filter is case 
                insensitive. -->
            <filter class="solr.StopFilterFactory" ignoreCase="true"
                words="stopwords.txt" enablePositionIncrements="true" />

            <!-- Removes every apostrophe from each token (replaced with the empty string) -->
            <filter class="solr.PatternReplaceFilterFactory" pattern="'" replacement="" replace="all" />
            <!-- Word-delimiter filter: generates word parts and number parts, catenates
                words and catenates numbers (catenateAll is off), and splits on case change -->
            <filter class="solr.WordDelimiterFilterFactory"
                 generateWordParts="1" 
                 generateNumberParts="1" 
                 catenateWords="1" 
                 catenateNumbers="1" 
                 catenateAll="0" 
                 splitOnCaseChange="1"
            />
            <!-- Transforms text to lower case -->
            <filter class="solr.LowerCaseFilterFactory" />

            <!-- Emits front-edge n-grams of 1 to 15 characters for each token -->
            <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>
        </analyzer>
        <!-- Configures the analysis done at the query time -->
        <analyzer type="query">
            <!-- Uses word break rules of the Unicode Text Segmentation algorithm
                when splitting text into words. -->
            <tokenizer class="solr.StandardTokenizerFactory" />
            <!-- <tokenizer class="solr.KeywordTokenizerFactory"/> -->
            <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
            <!-- Removes words found from stopwords.txt file. This filter is case 
                insensitive. -->
            <filter class="solr.StopFilterFactory" ignoreCase="true"
                words="stopwords.txt" enablePositionIncrements="true" />
            <!-- Applies synonyms found from the synonyms.txt file. -->
            <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
                ignoreCase="true" expand="true" />
            <!-- Transforms text to lower case -->
            <filter class="solr.LowerCaseFilterFactory" />

            <!-- Emits front-edge n-grams of 1 to 15 characters for each query token.
                NOTE(review): generating edge n-grams at query time as well as at index
                time is presumably a source of the short fragment suggestions reported
                in the question; verify before relying on this. -->
            <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>
        </analyzer>
    </fieldType>
</types>

下面是 solrconfig.xml 中的 SearchComponent 配置

<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<str name="queryAnalyzerFieldType">text_general</str>

<!-- Spellchecker named "default": solr.DirectSolrSpellChecker reading the `text`
     field with the "internal" distance measure. The numeric settings below
     (accuracy, maxEdits, minPrefix, maxInspections, minQueryLength,
     maxQueryFrequency) are tuning parameters of that class;
     NOTE(review): confirm their exact semantics against the Solr reference
     guide for the deployed version. -->
<lst name="spellchecker">
    <str name="name">default</str>
    <str name="field">text</str>
    <str name="classname">solr.DirectSolrSpellChecker</str>
    <str name="distanceMeasure">internal</str>
    <float name="accuracy">0.5</float>
    <int name="maxEdits">2</int>
    <int name="minPrefix">1</int>
    <int name="maxInspections">5</int>
    <int name="minQueryLength">4</int>
    <float name="maxQueryFrequency">0.01</float>
</lst>

<!-- a spellchecker that can break or combine words.  See "/spell" handler below for usage -->
<!-- Spellchecker named "wordbreak": solr.WordBreakSolrSpellChecker reading the
     `text` field, with both combineWords and breakWords enabled and maxChanges
     set to 10.
     NOTE(review): breakWords=true is presumably what produces the
     space-separated fragment suggestions (e.g. "f r g o") listed in the
     question; verify. -->
<lst name="spellchecker">
    <str name="name">wordbreak</str>
    <str name="classname">solr.WordBreakSolrSpellChecker</str>      
    <str name="field">text</str>
    <str name="combineWords">true</str>
    <str name="breakWords">true</str>
    <int name="maxChanges">10</int>
</lst>

下面是 solrconfig.xml 中的 RequestHandler 配置

<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
<!-- Default request parameters for this handler: `text` is the default search
     field, spellchecking is switched on with extended results, and
     spellcheck.dictionary is given twice so that both the "default" and the
     "wordbreak" spellcheckers are selected. The remaining entries set the
     suggestion counts and the collation options. -->
<lst name="defaults">
    <str name="df">text</str>
    <str name="spellcheck.dictionary">default</str>
    <str name="spellcheck.dictionary">wordbreak</str>
    <str name="spellcheck">on</str>
    <str name="spellcheck.extendedResults">true</str>       
    <str name="spellcheck.count">10</str>
    <str name="spellcheck.alternativeTermCount">5</str>
    <str name="spellcheck.maxResultsForSuggest">5</str>       
    <str name="spellcheck.collate">true</str>
    <str name="spellcheck.collateExtendedResults">true</str>  
    <str name="spellcheck.maxCollationTries">10</str>
    <str name="spellcheck.maxCollations">5</str>         
</lst>
<!-- Registers the "spellcheck" search component in this handler's
     last-components list. -->
<arr name="last-components">
  <str>spellcheck</str>
</arr>

现在，当我用城市名 frgo 调用 /spell 时，会生成以下查询字符串。

qt=/spell&spellcheck.q=frgo&spellcheck=true&mm=100%

我得到以下替代方案：（当前结果）

[f r, f r g, fargo, f r g o, farg]

它应该给我：（预期）

[fargo]

另一种情况：当我输入 wset frgo 时，会生成以下查询字符串。

qt=/spell&spellcheck.q=wset+frgo&spellcheck=true&mm=100%

我得到以下替代方案：（当前结果）

[w s, w s e, west, w s e t, wert, f r, f r g, fargo, f r g o, farg]

它应该给我：（预期）

[West, West Fargo]

我确实尝试了通过谷歌找到的一些解决方案，但我认为我在配置时犯了错误。我也尝试过在 index 和 query 分析器中使用 solr.KeywordTokenizerFactory 和 solr.WhitespaceTokenizerFactory。

请指导我。

编辑：

我删除了下面的过滤器，问题在一定程度上得到了解决。

<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>

现在，如果我输入 wset frgo，会生成下面的查询字符串

qt=/spell&spellcheck.q=wset+frgo&spellcheck=true

我得到以下替代方案：（当前结果）

[west, fargo, farg]

它应该给我：（预期）

[west, fargo, west fargo]

West Fargo 仍然没有返回。

请指导我

谢谢

score 0 · Accepted Answer

0

请在 Solr Admin 中检查该字段，确认 West Fargo 是否已被索引。

于 2014-02-10T12:56:02.780 回答

score 0 · Accepted Answer

在你的 wordbreak 拼写检查器配置中更改此配置

<str name="breakWords">true</str>

至

<str name="breakWords">false</str>

或者您可以从“/spell”处理程序中完全删除 wordbreak 拼写检查器。

solrj - Spring Solr 拼写检查组件：如何匹配最接近的单词

2 回答 2

Related

Reference