0

谁能告诉我,为什么在下面的映射中,“asciifolding”过滤器与“pattern”分词器(tokenizer)一起使用时不起作用?

我需要使用“pattern”分词器,但同时也不希望区分带重音和不带重音的单词,而这正是“asciifolding”过滤器的功能。

我需要让“televisão”与“televisao”等价,但在同时配置了“asciifolding”过滤器和“pattern”分词器的“analyzer_customizado”分析器中,“asciifolding”并没有生效。

{
  "settings": {
    "index": {
      "number_of_shards": "5",
      "number_of_replicas": "0",
      "analysis": {
        "filter": {
          "stemmer_plural_portugues": {
            "name": "minimal_portuguese",
            "stopwords" : ["http", "https", "ftp", "www"],
            "type": "stemmer"
          },
          
          
            "synonym_filter": {
            "type": "synonym",
            "lenient": true,
            "synonyms_path": "analysis/synonym.txt",
            "updateable" : true

          },
          
       
          "shingle_filter": {
            "type": "shingle",
            "min_shingle_size": 2,
            "max_shingle_size": 3
          }

        },
        
        "analyzer": {
          "analyzer_customizado": {
            "filter": [
              "lowercase",
              "stemmer_plural_portugues",
              "asciifolding",
              "synonym_filter",
              "shingle_filter"
              
            ],
            "tokenizer": "pattern"
          }
        }

      }
    }
  },
  "mappings": {
      "properties": {

        "id": {
         "type": "long"
        },
         "data": {
          "type": "date"
        },
         "quebrado": {
          "type": "byte"
          
        },
         "pgrk": {
           "type":  "integer" 
        },
         "url_length": {
           "type":  "integer" 
        },
        "titulo": {
          "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "descricao": {
        "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        },
        "url": {
          "analyzer": "analyzer_customizado",
          "type": "text",
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          }
        }
      }
    }
  }

有人可以告诉我应该如何修改我的映射,使“asciifolding”能在使用“pattern”分词器的“analyzer_customizado”中正常工作吗?

4

2 回答 2

0

问题出在 pattern 分词器的 pattern 参数的默认值上,官方文档中对此的说明如下:

默认模式是 \W+,它会在遇到非单词字符时拆分文本。

您可以使用 analyze API 自行验证:对于 televisão,它会生成两个词元(token),因为它把 ã 视为非单词字符。

{
    "tokenizer": "pattern",
    "text": "televisão"
}

{
    "tokens": [
        {
            "token": "televis",
            "start_offset": 0,
            "end_offset": 7,
            "type": "word",
            "position": 0
        },
        {
            "token": "o",
            "start_offset": 8,
            "end_offset": 9,
            "type": "word",
            "position": 1
        }
    ]
}

解决方案:不幸的是,目前没有 asciifolding 字符过滤器(char filter)可以在分词之前把这些字符转换为对应的 ASCII 字符,从而避免它们被 pattern 分词器拆分成不同的词元。您可以参考这篇讨论帖子,其中讨论了这个问题,并建议使用自定义插件。

编辑:正如 @Val 在评论中建议的那样,您还可以使用 mapping 字符过滤器(char filter)并定义自己的字符映射;字符过滤器是分析流程的第一阶段,因此这些字符会在分词之前就被转换。

于 2020-06-26T05:32:20.303 回答
0

我在映射中添加了“char_filter”过滤器,并把它加入到使用“pattern”分词器的“analyzer_customizado”中,但创建索引时报错,索引没有创建成功。

{
      "settings": {
        "index": {
          "number_of_shards": "5",
          "number_of_replicas": "0",
          "analysis": {
            "filter": {
              "stemmer_plural_portugues": {
                "name": "minimal_portuguese",
                "stopwords" : ["http", "https", "ftp", "www"],
                "type": "stemmer"
              },
              
              
                "synonym_filter": {
                "type": "synonym",
                "lenient": true,
                "synonyms_path": "analysis/synonym.txt",
                "updateable" : true
    
              },
              
           
              "shingle_filter": {
                "type": "shingle",
                "min_shingle_size": 2,
                "max_shingle_size": 3
              },
              
              
              
              
        "char_filter": [
        {
          "type": "mapping",
          "mappings": [
            "ã => a",
            "â => a",
            "à => a",
            "á => a"
          ]
        }
      ],
    
            
            
            "analyzer": {
              "analyzer_customizado": {
                "filter": [
                  "lowercase",
                  "stemmer_plural_portugues",
                  "synonym_filter",
                  "shingle_filter",
                  "char_filter"
                  
                ],
                "tokenizer": "pattern"
              }
            }
    
          }
        }
      },
      "mappings": {
          "properties": {
    
            "id": {
             "type": "long"
            },
             "data": {
              "type": "date"
            },
             "quebrado": {
              "type": "byte"
              
            },
             "pgrk": {
               "type":  "integer" 
            },
             "url_length": {
               "type":  "integer" 
            },
            "titulo": {
              "analyzer": "analyzer_customizado",
              "type": "text",
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "descricao": {
            "analyzer": "analyzer_customizado",
              "type": "text",
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            },
            "url": {
              "analyzer": "analyzer_customizado",
              "type": "text",
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              }
            }
          }
        }
      }
    }

以下是我尝试使用“char_filter”过滤器创建索引时出现的错误

{
  "error": {
    "root_cause": [
      {
        "type": "settings_exception",
        "reason": "Failed to load settings from [{\"mappings\":{\"properties\":{\"url_length\":{\"type\":\"integer\"},\"data\":{\"type\":\"date\"},\"pgrk\":{\"type\":\"integer\"},\"titulo\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}},\"quebrado\":{\"type\":\"byte\"},\"id\":{\"type\":\"long\"},\"url\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}},\"descricao\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}}}},\"index\":{\"number_of_shards\":\"5\",\"analysis\":{\"filter\":{\"stemmer_plural_portugues\":{\"name\":\"minimal_portuguese\",\"type\":\"stemmer\",\"stopwords\":[\"http\",\"https\",\"ftp\",\"www\"]},\"synonym_filter\":{\"updateable\":true,\"synonyms_path\":\"analysis/synonym.txt\",\"type\":\"synonym\",\"lenient\":true},\"char_filter\":[{\"mappings\":[\"ã => a\",\"â => a\",\"à => a\",\"á => a\"],\"type\":\"mapping\"}],\"analyzer\":{\"analyzer_customizado\":{\"filter\":[\"lowercase\",\"stemmer_plural_portugues\",\"char_filter\",\"synonym_filter\",\"shingle_filter\"],\"tokenizer\":\"pattern\"}},\"shingle_filter\":{\"min_shingle_size\":2,\"max_shingle_size\":3,\"type\":\"shingle\"}}},\"number_of_replicas\":\"0\"}}]"
      }
    ],
    "type": "settings_exception",
    "reason": "Failed to load settings from [{\"mappings\":{\"properties\":{\"url_length\":{\"type\":\"integer\"},\"data\":{\"type\":\"date\"},\"pgrk\":{\"type\":\"integer\"},\"titulo\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}},\"quebrado\":{\"type\":\"byte\"},\"id\":{\"type\":\"long\"},\"url\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}},\"descricao\":{\"analyzer\":\"analyzer_customizado\",\"type\":\"text\",\"fields\":{\"keyword\":{\"ignore_above\":256,\"type\":\"keyword\"}}}}},\"index\":{\"number_of_shards\":\"5\",\"analysis\":{\"filter\":{\"stemmer_plural_portugues\":{\"name\":\"minimal_portuguese\",\"type\":\"stemmer\",\"stopwords\":[\"http\",\"https\",\"ftp\",\"www\"]},\"synonym_filter\":{\"updateable\":true,\"synonyms_path\":\"analysis/synonym.txt\",\"type\":\"synonym\",\"lenient\":true},\"char_filter\":[{\"mappings\":[\"ã => a\",\"â => a\",\"à => a\",\"á => a\"],\"type\":\"mapping\"}],\"analyzer\":{\"analyzer_customizado\":{\"filter\":[\"lowercase\",\"stemmer_plural_portugues\",\"char_filter\",\"synonym_filter\",\"shingle_filter\"],\"tokenizer\":\"pattern\"}},\"shingle_filter\":{\"min_shingle_size\":2,\"max_shingle_size\":3,\"type\":\"shingle\"}}},\"number_of_replicas\":\"0\"}}]",
    "caused_by": {
      "type": "illegal_state_exception",
      "reason": "only value lists are allowed in serialized settings"
    }
  },
  "status": 500
}
于 2020-06-26T13:07:38.680 回答