0

任何人都可以分享法语数据的最佳 solr 架构吗?我已经为我当前的项目实现了这个,但它没有为法语文本提供正确的数据。

我的 schema.xml:

<schema name="example core zero" version="1.1">
  <types>
    <!-- boolean type: "true" or "false" -->
    <fieldtype name="string"  class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <!--
      Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.

      These fields support doc values, but they require the field to be
      single-valued and either be required or have a default value.
    -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="0">
      <analyzer type="index">
     <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.FrenchLightStemFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="French" />
        <filter class="solr.ASCIIFoldingFilterFactory"/>
      </analyzer>
      <analyzer type="query">
     <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.FrenchLightStemFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="French" />
 <filter class="solr.ASCIIFoldingFilterFactory"/>     
    </analyzer>
    </fieldType>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="sku" class="solr.TextField" omitNorms="false">
            <analyzer type="index">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="\s" replacement=""/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="[^a-zA-Z0-9]" replacement=""/>
                <filter class="solr.NGramFilterFactory" maxGramSize="30"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="\s" replacement=""/>
                <filter class="solr.PatternReplaceFilterFactory" pattern="[^a-zA-Z0-9]" replacement=""/>
            </analyzer>
        </fieldType>
        <fieldType name="exact" class="solr.TextField" omitNorms="false">
            <analyzer type="index">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.KeywordTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
            </analyzer>
        </fieldType>

   </types>

 <fields>   
  <!-- general -->
  <field name="id"        type="string"   indexed="true"  stored="true"  multiValued="false" required="true"/>
  <field name="type"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="name"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="core0"     type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="_version_" type="long"     indexed="true"  stored="true"/>
  <field name="sku"       type="sku"     indexed="true"  stored="true"/>

   <!-- Dynamic field definitions allow using convention over configuration
       for fields via the specification of patterns to match field names. 
       EXAMPLE:  name="*_i" will match any field ending in _i (like myid_i, z_i)
       RESTRICTION: the glob-like pattern in the name attribute must have
       a "*" only at the start or the end.  -->

   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_is" type="int"    indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true" />
   <dynamicField name="*_ss" type="string"  indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
   <dynamicField name="*_ls" type="long"   indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
   <dynamicField name="*_txt" type="text_general"   indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_bs" type="boolean" indexed="true" stored="true"  multiValued="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_fs" type="float"  indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
   <dynamicField name="*_ds" type="double" indexed="true"  stored="true"  multiValued="true"/>

   <dynamicField name="*_dt"  type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_dts" type="date"    indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_exact"  type="exact"    indexed="true"  stored="true"/>

   <!-- uncomment the following to ignore any fields that don't already match an existing 
        field name or dynamic field, rather than reporting them as an error. 
        alternately, change the type="ignored" to some other type e.g. "text" if you want 
        unknown fields indexed and/or stored by default --> 
   <!--dynamicField name="*" type="ignored" multiValued="true" /-->

 </fields>

 <!-- field to use to determine and enforce document uniqueness. -->
 <uniqueKey>id</uniqueKey>

 <!-- field for the QueryParser to use when an explicit fieldname is absent -->
 <defaultSearchField>name</defaultSearchField>

 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>
</schema>
4

1 回答 1

1

你是干了两次。词干并非旨在相互增强,使用两个只会给您带来不可预知的结果。选择其中一个FrenchLightStemFilter或雪球过滤器,而不是两者。

如果你不确定你想要哪个,我会说从 开始FrenchLightStemFilter,如果你觉得它太保守了,试试雪球过滤器。

您也可以采取简单的路线,并尝试使用预定义的FrenchAnalyzer,例如:

<analyzer class="org.apache.lucene.analysis.fr.FrenchAnalyzer"/>

没有ASCIIFoldingFilter,尽管我不知道在词干分析器之后是否真的有必要。

于 2014-07-24T16:31:40.523 回答