elasticsearch - Elasticsearch `Procter&Gamble` 和 `Procter & Gamble` 问题

Question

我的目标是： * 让 procter&gamble 和 procter & gamble 产生相同的结果（包括分数） * 方案要通用，而不是依赖同义词，因为同样的写法可以是任意的 Somehow&Somewhat * 短语匹配时，把 procter&gamble 或 procter & gamble 作为一个整体高亮，而不是分别高亮各个词元 * 我想使用 simple_query_string，因为我需要允许用户使用搜索运算符 * AT&T 也必须能被搜索到

这是我的代码片段。问题在于：搜索 procter&gamble 和搜索 procter & gamble 得到的分数不同，返回的文档也不同。但用户期望 procter&gamble 和 procter & gamble 得到相同的结果。

# Recreate the english_example index from scratch: drop any existing copy, then
# create it with a custom "default" analyzer.
#
# The "default" analyzer defined below runs, in order:
#   char filter ampersand_filter -> whitespace tokenizer
#   -> english_possessive_stemmer -> lowercase -> acronymns (word_delimiter_graph)
#   -> flatten_graph -> english_stop -> custom_stop_words_filter
#   -> english_keywords -> english_stemmer
#
# ampersand_filter rewrites "&", together with any spaces directly around it, to
# "_and_". Both lookaheads in its pattern, (?=[^&]*), can match the empty string
# and therefore always succeed, so the pattern behaves like " *& *".
#
# NOTE(review): the token filter "acronymns_" (same settings as "acronymns") and
# the char filter "ampersand_filter2" are defined but not referenced by the
# analyzer in this request.
# NOTE(review): custom_stop_words_filter drops the token "t" (case-insensitive);
# presumably this targets the trailing "t" split out of "AT&T" -- confirm.
DELETE /english_example
PUT /english_example
{
  "settings": {
    "analysis": {
      "filter": {
        "english_stop": {
          "type":       "stop",
          "stopwords":  "_english_" 
        },
        "english_keywords": {
          "type":       "keyword_marker",
          "keywords":   ["example"] 
        },
        "english_stemmer": {
          "type":       "stemmer",
          "language":   "english"
        },
        "english_possessive_stemmer": {
          "type":       "stemmer",
          "language":   "possessive_english"
        },
        "acronymns": {
          "type": "word_delimiter_graph",
          "catenate_all" : true,
          "preserve_original":true
        },
        "acronymns_": {
          "type": "word_delimiter_graph",
          "catenate_all" : true,
          "preserve_original":true
        },
        "custom_stop_words_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [ "t" ]
        }

      },
      "analyzer": {
        "default": {
          "tokenizer":  "whitespace",
          "char_filter": [
           "ampersand_filter"
          ],
          "filter": [
            "english_possessive_stemmer",
            "lowercase",
            "acronymns",
            "flatten_graph",
            "english_stop",
            "custom_stop_words_filter",
            "english_keywords",
            "english_stemmer"
          ]
        }
      },
      "char_filter": {
        "ampersand_filter": {
          "type": "pattern_replace",
          "pattern": "(?=[^&]*)( {0,}& {0,})(?=[^&]*)",
          "replacement": "_and_"
        },
        "ampersand_filter2": {
          "type": "mapping",
          "mappings": [
            "& => _and_"
          ]
        }
      }
    }
  }
}
# Bulk-index five sample documents (ids 1-5) into english_example. Their
# "description" and "contents" fields spell the company name in several ways
# ("Procter & Gamble", "Procter&Gamble", "Procter and Gamble",
# "Procter or Gamble"); documents 1 and 5 also contain "AT&T".
PUT /english_example/_bulk 
{ "index" : { "_id" : "1" } }
{ "description" : "wi-fi AT&T BB&T Procter & Gamble, some\nOther $500 games with Peter's", "contents" : "Much text with somewhere I meet Procter or Gamble" }
{ "index" : { "_id" : "2" } }
{ "description" : "Procter & Gamble", "contents" : "Much text with somewhere I meet Procter and Gamble" }
{ "index" : { "_id" : "3" } }
{ "description" : "Procter&Gamble", "contents" : "Much text with somewhere I meet Procter & Gamble" }
{ "index" : { "_id" : "4" } }
{ "description" : "Come Procter&Gamble", "contents" : "Much text with somewhere I meet Procter&Gamble" }
{ "index" : { "_id" : "5" } }
{ "description" : "Tome Procter & Gamble", "contents" : "Much text with somewhere I don't meet AT&T" }


# "query": "procter & gamble",
# Search english_example for the spaced spelling with simple_query_string
# (default operator "or") over description (boost 2) and contents (boost 80),
# and request highlighting on both fields.
GET english_example/_search
{
    "query": {
      "simple_query_string": {
          "query": "procter & gamble",
          "default_operator": "or",
          "fields": [
            "description^2",
            "contents^80"
          ]
      }
    },
    "highlight": {
      "fields": {
        "description": {},
        "contents": {}
      }
    }
}


# "query": "procter&gamble",
# Same simple_query_string search as for the spaced spelling, but with the
# unspaced query text "procter&gamble"; fields, boosts, default operator and
# highlighting are identical.
GET english_example/_search
{
    "query": {
      "simple_query_string": {
          "query": "procter&gamble",
          "default_operator": "or",
          "fields": [
            "description^2",
            "contents^80"
          ]
      }
    },
    "highlight": {
      "fields": {
        "description": {},
        "contents": {}
      }
    }
}


# "query": "at&t",
# simple_query_string search for "at&t" over description (boost 2) and contents
# (boost 80) with default operator "or", highlighting both fields.
GET english_example/_search
{
    "query": {
      "simple_query_string": {
          "query": "at&t",
          "default_operator": "or",
          "fields": [
            "description^2",
            "contents^80"
          ]
      }
    },
    "highlight": {
      "fields": {
        "description": {},
        "contents": {}
      }
    }
}

在我的代码片段中，我使用 word_delimiter_graph 过滤器和 whitespace 分词器重新定义了默认分析器，以便能够搜索到 AT&T。

score 0 · Accepted Answer

我刚刚意识到您正在搜索描述字段而不是公司字段。所以关键字分析器不起作用。我已经相应地更新了我的答案。

您可以尝试添加一个自定义子字段，其分析器由 whitespace 分词器和 lowercase 过滤器组成，并在搜索时使用同一个自定义分析器。执行搜索时，用 multi_match 查询同时搜索标准字段和这个自定义字段，这样就可以同时支持两种写法。您还可以提高自定义字段的权重，使完全匹配的结果排在搜索结果的顶部。

诀窍是在执行搜索之前将用户输入转换为小写。您不应该按原样使用用户输入。否则这种方法行不通。

您可以使用以下脚本进行尝试。

# Recreate the test1 index: drop any existing copy, then create it with
#   - lowercase_analyzer: a custom analyzer made of the whitespace tokenizer and
#     the lowercase token filter;
#   - description: a text field analyzed with the standard analyzer, plus a
#     sub-field description.custom that uses lowercase_analyzer for both
#     indexing and searching.
DELETE /test1
PUT /test1
{
  "settings": {
    "analysis": {
      "analyzer": {
        "lowercase_analyzer" : {
          "filter" : ["lowercase"],
          "type" : "custom",
          "tokenizer" : "whitespace"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "description" : {
        "type": "text",
        "analyzer": "standard",
        "fields": {
          "custom" : {
            "type" : "text",
            "analyzer" : "lowercase_analyzer",
            "search_analyzer" : "lowercase_analyzer"
          }
        }
      }
    }
  }
}

# Bulk-index three sample documents (ids 1-3) into test1; their descriptions
# contain "Procter & Gamble" (1 and 2) and "Procter&Gamble" (3), and document 1
# also contains "AT&T".
PUT /test1/_bulk
{ "index" : { "_id" : "1" } }
{ "description" : "wi-fi AT&T BB&T Procter & Gamble, some\nOther $500 games with Peter's" }
{ "index" : { "_id" : "2" } }
{ "description" : "Procter & Gamble" }
{ "index" : { "_id" : "3" } }
{ "description" : "Procter&Gamble" }

# multi_match search for "procter&gamble" across description and
# description.custom, highlighting matches in both fields.
GET test1/_search
{
  "query": {
    "multi_match": {
      "query": "procter&gamble",
      "fields": ["description", "description.custom"]
    }
  },
  "highlight": {
    "fields": {
      "description": {},
      "description.custom": {}
    }
  }
}


# multi_match search for the single word "procter" across description and
# description.custom, highlighting matches in both fields.
GET test1/_search
{
  "query": {
    "multi_match": {
      "query": "procter",
      "fields": ["description", "description.custom"]
    }
  },
  "highlight": {
    "fields": {
      "description": {},
      "description.custom": {}
    }
  }
}

# multi_match search for "at&t" across description and description.custom,
# highlighting matches in both fields.
GET test1/_search
{
  "query": {
    "multi_match": {
      "query": "at&t",
      "fields": ["description", "description.custom"]
    }
  },
  "highlight": {
    "fields": {
      "description": {},
      "description.custom": {}
    }
  }
}

# multi_match search for the spaced spelling "procter & gamble" across
# description and description.custom, highlighting matches in both fields.
GET test1/_search
{
  "query": {
    "multi_match": {
      "query": "procter & gamble",
      "fields": ["description", "description.custom"]
    }
  },
  "highlight": {
    "fields": {
      "description": {},
      "description.custom": {}
    }
  }
}

您可以添加突出显示并尝试一下。

score 0 · Accepted Answer

我能想到的一个办法是使用 bool 查询的 should 子句，把“标准分析器”和您的自定义分析器结合起来。

对于“procter & gamble”，自定义分析器和标准分析器生成的词元都是“procter”、“gamble”。对于“procter&gamble”，自定义分析器生成的词元是“procter”、“gamble”、“procter&gamble”，而标准分析器生成的是“procter”和“gamble”。

所以在 should 子句中，我们可以使用标准分析器来查找“procter”或“gamble”，并使用自定义分析器来查找“procter&gamble”。

# bool query with two should clauses that both match "Procter&Gamble" against
# the description field: the first overrides the analyzer with "standard", the
# second sets no analyzer and so falls back to the one configured for the field.
# minimum_should_match = 1 requires at least one of the two clauses to match.
GET english_example/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "description": {
              "query": "Procter&Gamble",
              "analyzer": "standard"
            }
          }
        },
        {
          "match": {
            "description": {
              "query": "Procter&Gamble"
            }
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}

第二种选择是使用同义词：您可以把所有变体都定义为同义词，让 procter 与 gamble 的各种写法都表示同一个实体。

elasticsearch - Elasticsearch `Procter&Gamble` 和 `Procter & Gamble` 问题

2 回答 2

Related

Reference