我使用 Elasticsearch 的 N-gram tokenizer,
并配合 match_phrase 查询
进行模糊匹配。我的索引和测试数据如下所示:
DELETE /m8
PUT m8
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 3,
"custom_token_chars":"_."
}
}
},
"max_ngram_diff": 10
},
"mappings": {
"table": {
"properties": {
"dataSourceId": {
"type": "long"
},
"dataSourceType": {
"type": "integer"
},
"dbName": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
PUT /m8/table/1
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm.rf"
}
PUT /m8/table/2
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rm_rf"
}
PUT /m8/table/3
{
"dataSourceId":1,
"dataSourceType":2,
"dbName":"rmrf"
}
用 _analyze 检查分词结果:
POST m8/_analyze
{
"tokenizer": "my_tokenizer",
"text": "rm.rf"
}
_analyze 返回的结果:
{
"tokens" : [
{
"token" : "r",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "rm",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "rm.",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "m",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 3
},
{
"token" : "m.",
"start_offset" : 1,
"end_offset" : 3,
"type" : "word",
"position" : 4
},
{
"token" : "m.r",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 5
},
{
"token" : ".",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 6
},
{
"token" : ".r",
"start_offset" : 2,
"end_offset" : 4,
"type" : "word",
"position" : 7
},
{
"token" : ".rf",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 8
},
{
"token" : "r",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 9
},
{
"token" : "rf",
"start_offset" : 3,
"end_offset" : 5,
"type" : "word",
"position" : 10
},
{
"token" : "f",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 11
}
]
}
当我搜索“rm”时,什么也没找到:
GET /m8/table/_search
{
"query": {
"bool": {
"must": [
{
"match_phrase": {
"dbName": "rm"
}
}
]
}
}
}
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
但是把同一个查询中的搜索词换成“.rf”时,却可以找到文档:
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 1.7260926,
"hits" : [
{
"_index" : "m8",
"_type" : "table",
"_id" : "1",
"_score" : 1.7260926,
"_source" : {
"dataSourceId" : 1,
"dataSourceType" : 2,
"dbName" : "rm.rf"
}
}
]
}
}
我的问题:既然 _analyze 已经从文本中拆分出了“rm”这个词元(token),为什么用 match_phrase 搜索“rm”时仍然找不到任何文档?