2

我一直在尝试使用 facet 来获取字段的词频。我的查询只返回一个命中,所以我想让 facet 返回在特定字段中频率最高的术语。

我的映射:

{
"mappings":{
    "document":{
        "properties":{
            "tags":{
                "type":"object",
                "properties":{
                    "title":{
                        "fields":{
                            "partial":{
                                "search_analyzer":"main",
                                "index_analyzer":"partial",
                                "type":"string",
                                "index" : "analyzed"
                            }
                            "title":{
                                "type":"string",
                                "analyzer":"main",
                                "index" : "analyzed"
                            }
                        },
                        "type":"multi_field"
                    }
                }
            }
        }
    }
},

"settings":{
    "analysis":{
        "filter":{
            "name_ngrams":{
                "side":"front",
                "max_gram":50,
                "min_gram":2,
                "type":"edgeNGram"
            }
        },

        "analyzer":{
            "main":{
                "filter": ["standard", "lowercase", "asciifolding"],
                "type": "custom",
                "tokenizer": "standard"
            },
            "partial":{
                "filter":["standard","lowercase","asciifolding","name_ngrams"],
                "type": "custom",
                "tokenizer": "standard"
            }
        }
    }
}

}

测试数据:

 curl -XPUT localhost:9200/testindex/document -d '{"tags": {"title": "people also kill people"}}'

询问:

 curl -XGET 'localhost:9200/testindex/document/_search?pretty=1' -d '
{
    "query":
    {
       "term": { "tags.title": "people" }
    },
    "facets": {
       "popular_tags": { "terms": {"field": "tags.title"}}
    }
}'

这个结果

"hits" : {
   "total" : 1,
    "max_score" : 0.99381393,
    "hits" : [ {
    "_index" : "testindex",
    "_type" : "document",
    "_id" : "uI5k0wggR9KAvG9o7S7L2g",
    "_score" : 0.99381393, "_source" : {"tags": {"title": "people also kill people"}}
 } ]
},
"facets" : {
  "popular_tags" : {
  "_type" : "terms",
  "missing" : 0,
  "total" : 3,
  "other" : 0,
  "terms" : [ {
    "term" : "people",
    "count" : 1            // I expect this to be 2
   }, {
    "term" : "kill",
    "count" : 1
  }, {
    "term" : "also",
    "count" : 1
  } ]
}

}

上面的结果不是我想要的。我想让频率计数为 2

"hits" : {
   "total" : 1,
   "max_score" : 0.99381393,
   "hits" : [ {
   "_index" : "testindex",
   "_type" : "document",
   "_id" : "uI5k0wggR9KAvG9o7S7L2g",
   "_score" : 0.99381393, "_source" : {"tags": {"title": "people also kill people"}}
} ]
},
"facets" : {
"popular_tags" : {
  "_type" : "terms",
  "missing" : 0,
  "total" : 3,
  "other" : 0,
  "terms" : [ {
    "term" : "people",
    "count" : 2            
  }, {
    "term" : "kill",
    "count" : 1
  }, {
    "term" : "also",
    "count" : 1
  } ]
 }
}

我如何实现这一目标?刻面是错误的方式吗?

4

2 回答 2

6

一个方面计算文档,而不是属于它们的术语。您得到 1,因为只有一个文档包含该术语,无论发生多少次都没有关系。我不知道返回术语频率的开箱即用方式,方面不是一个好的选择。
如果您启用术语向量,该信息可以存储在索引中,但目前无法从弹性搜索中读取术语向量。

于 2012-11-05T12:12:20.060 回答
0

不幸的是,Elastic 中没有字段的术语频率。GitHub 项目Index TermList正在使用 Lucene 的术语并计算所有文档的出现总数,您可以检查它并根据您的需要进行替换。

于 2015-11-05T13:19:14.647 回答