PUT /new_index/
{
    "settings": {
        "index": {
            "type": "default"
        },
        "number_of_shards": 5,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "ap_stop": {
                    "type": "stop",
                    "stopwords_path": "stoplist.txt"
                },
                "shingle_filter" : {
                    "type" : "shingle",
                    "min_shingle_size" : 2,
                    "max_shingle_size" : 5,
                    "output_unigrams": true
                }
            },
        "analyzer": {
             "aplyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["standard",
                           "ap_stop",
                           "lowercase",
                           "shingle_filter",
                           "snowball"]
                }
            }
        }
    }
}

PUT /new_index/document/_mapping/
{
    "document": {
        "properties": {
            "text": {
                "type": "string",
                "store": true,
                "index": "analyzed",
                "term_vector": "with_positions_offsets_payloads",
                "search_analyzer": "aplyzer",
                "index_analyzer": "aplyzer"
            },
            "original_text": {
                "include_in_all": false,
                "type": "string",
                "store": false,
                "index": "not_analyzed"
            },
            "docid": {
                "include_in_all": false,
                "type": "string",
                "store": true,
                "index": "not_analyzed"  
            }
        }
    }
}

I need to convert the above index settings and mappings to elastic4s. I'm using the latest elastic4s and Elasticsearch 1.5.2.

I went through some of the examples given in the docs, but I can't figure out how to do this. I tried creating it this way:

client.execute {
    create index "new_index" mappings {
      "documents" as (
        "text" typed StringType analyzer ...
        )
    }
  }

I can't figure out how to set the store, index, term_vector, etc. options given in the PUT request.

Update: Based on the answer, I was able to come up with something like this:

create index "new_index" shards 5 replicas 1 refreshInterval "90s"  mappings {
    "documents" as(
      id typed StringType analyzer KeywordAnalyzer store true includeInAll false,
      "docid" typed StringType index "not_analyzed" store true includeInAll false,
      "original_text" typed StringType index "not_analyzed" includeInAll false,
      "text" typed StringType analyzer CustomAnalyzer("aplyzer") indexAnalyzer "aplyzer" searchAnalyzer "aplyzer" store true termVector WithPositionsOffsetsPayloads
      )
  } analysis (
    CustomAnalyzerDefinition(
      "aplyzer",
      StandardTokenizer,
      LowercaseTokenFilter,
      shingle tokenfilter "shingle_filter" minShingleSize 2 maxShingleSize 5 outputUnigrams true
    )
  )

What I can't figure out now is how to add the snowball stemmer and the stopwords file path to the aplyzer analyzer.

How should I do that?


2 Answers


Your title asks about custom filters, but the body of your question asks about store, index, and term_vector. I'll cover the latter.

  client.execute {
    create index "myindex" mappings {
      "mytype" as (
        "myfield" typed StringType store true termVector WithOffsets index "not_analyzed"
        )
    }
  }
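
For reference, that DSL is intended to produce a mapping along these lines (a sketch of the expected Elasticsearch 1.x JSON, not necessarily the client's exact serialization):

PUT /myindex
{
    "mappings": {
        "mytype": {
            "properties": {
                "myfield": {
                    "type": "string",
                    "index": "not_analyzed",
                    "store": true,
                    "term_vector": "with_offsets"
                }
            }
        }
    }
}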

Update

Regarding your updated question: the Elasticsearch documentation is not clear on whether you can set stopwords on the snowball token filter, but you can set them on the snowball analyzer.

So, either:

SnowballAnalyzerDefinition("mysnowball", "English", stopwords = Set("I", "he", "the"))

or:

CustomAnalyzerDefinition("mysnowball",
  StandardTokenizer,
  LowercaseTokenFilter,
  snowball tokenfilter "snowball1" language "German"
)
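
The first option corresponds roughly to this analysis JSON (an Elasticsearch 1.x snowball analyzer with an inline stopword list; a sketch of the intended settings, not the client's exact output):

"analysis": {
    "analyzer": {
        "mysnowball": {
            "type": "snowball",
            "language": "English",
            "stopwords": ["I", "he", "the"]
        }
    }
}
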
Answered 2015-06-03T18:11:11.407

Based on @monkjack's suggestion and what I read in the elastic4s docs, I finally came up with the following, which works with elastic4s. Browsing through the tests the author has written for the API also helps.

create index "new_index" shards 5 replicas 1 refreshInterval "90s" mappings {
    "documents" as(
      id
        typed StringType
        analyzer KeywordAnalyzer
        store true
        includeInAll false,
      "docid"
        typed StringType
        index "not_analyzed"
        store true
        includeInAll false,
      "original_text"
        typed StringType
        index "not_analyzed"
        includeInAll false,
      "text"
        typed StringType
        analyzer CustomAnalyzer("aplyzer")
        indexAnalyzer "aplyzer"
        searchAnalyzer "aplyzer"
        store true
        termVector WithPositionsOffsetsPayloads
      )
  } analysis (
    CustomAnalyzerDefinition(
      "aplyzer",
      StandardTokenizer,
      LowercaseTokenFilter,
      NamedStopTokenFilter("ap_stop", "_english_", true, true),
      shingle
        tokenfilter "shingle_filter"
        minShingleSize 2
        maxShingleSize 5
        outputUnigrams true
        outputUnigramsIfNoShingles true,
      snowball
        tokenfilter "ap_snowball"
        lang "English"
    )
  )

If you want to provide your own stopword list, use StopTokenFilter("ap_stop", stopwords = Set("a", "an", "the")) instead of NamedStopTokenFilter.
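
In context, the analyzer definition would then look something like this (a sketch using the same DSL as above; the stopword set is just an example):

CustomAnalyzerDefinition(
  "aplyzer",
  StandardTokenizer,
  LowercaseTokenFilter,
  StopTokenFilter("ap_stop", stopwords = Set("a", "an", "the")),
  shingle
    tokenfilter "shingle_filter"
    minShingleSize 2
    maxShingleSize 5
    outputUnigrams true
    outputUnigramsIfNoShingles true,
  snowball
    tokenfilter "ap_snowball"
    lang "English"
)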

When I run GET new_index in Sense, I get the following settings/mappings:

{
   "new_index": {
      "aliases": {},
      "mappings": {
         "documents": {
            "properties": {
               "docid": {
                  "type": "string",
                  "index": "not_analyzed",
                  "store": true,
                  "include_in_all": false
               },
               "original_text": {
                  "type": "string",
                  "index": "not_analyzed",
                  "include_in_all": false
               },
               "text": {
                  "type": "string",
                  "store": true,
                  "term_vector": "with_positions_offsets_payloads",
                  "analyzer": "aplyzer"
               }
            }
         }
      },
      "settings": {
         "index": {
            "creation_date": "1433383476240",
            "uuid": "6PmqlY6FRPanGtVSsGy3Jw",
            "analysis": {
               "analyzer": {
                  "aplyzer": {
                     "type": "custom",
                     "filter": [
                        "lowercase",
                        "ap_stop",
                        "shingle_filter",
                        "ap_snowball"
                     ],
                     "tokenizer": "standard"
                  }
               },
               "filter": {
                  "ap_stop": {
                     "enable_position_increments": "true",
                     "ignore_case": "true",
                     "type": "stop",
                     "stopwords": "_english_"
                  },
                  "shingle_filter": {
                     "output_unigrams_if_no_shingles": "true",
                     "token_separator": " ",
                     "max_shingle_size": "5",
                     "type": "shingle",
                     "min_shingle_size": "2",
                     "filler_token": "_",
                     "output_unigrams": "true"
                  },
                  "ap_snowball": {
                     "type": "snowball",
                     "language": "English"
                  }
               }
            },
            "number_of_replicas": "1",
            "number_of_shards": "5",
            "refresh_interval": "90s",
            "version": {
               "created": "1050299"
            }
         }
      },
      "warmers": {}
   }
}

If you want the stopwords and stemmers as separate analyzers, as @monkjack suggested, just add a SnowballAnalyzerDefinition and a StopAnalyzerDefinition like this:

....outputUnigramsIfNoShingles true,
    ),
    SnowballAnalyzerDefinition("ap_snowball", "English"),
    StopAnalyzerDefinition("ap_stop", stopwords = Set("a", "an", "the"))
  )
Answered 2015-06-04T02:22:40.520