My code is below. Even though I have listed ["is","it","possible"] in a stop-word filter, those words still show up in the search output. Can someone explain why Elasticsearch is not removing them from the input documents at index time?
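In case it helps with debugging, I assume the _analyze API can show which tokens the index actually produces (untested sketch, using the es client and index_name from the script below; the sample sentence is just a row from my CSV):

# Untested sketch: analyzing against the field shows which analyzer
# Elasticsearch actually applies to it at index time.
res = es.indices.analyze(index=index_name, body={
    "field": "content",
    "text": "Is it possible to sleep without dreaming?",
})
print([t["token"] for t in res["tokens"]])

# The my_stop filter can also be exercised in isolation; lowercase first,
# since the stop filter is case-sensitive by default.
res = es.indices.analyze(index=index_name, body={
    "tokenizer": "standard",
    "filter": ["lowercase", "my_stop"],
    "text": "Is it possible to sleep without dreaming?",
})
print([t["token"] for t in res["tokens"]])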
issue_with_stop_word.csv is as follows:
id,qid1,qid2,question1
5,11,12,How do I recover my Facebook login password?
7,15,16,Is it possible to sleep without dreaming?
11,23,24,How easy is it to hack the login password of a Macbook Air?
12,25,26,How easy is it to hack the login password of a Macbook Air?
13,27,28,Is it possible to know who visited my Facebook profile?
15,31,32,Is it possible to know who visited my Facebook profile?
16,33,34,Is it possible to know who visited my Facebook profile?
18,37,38,Is it possible to hack someone's Facebook messages?
20,41,42,Is it possible to know who visited my Facebook profile?
29,59,60,How do I recover my Facebook password without having to reset it?
31,63,64,What are some special cares for someone with a nose that gets stuffy during the night?
32,65,66,What Game of Thrones villain would be the most likely to give you mercy?
The code is as follows:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import csv

query = 'Is it possible ?'
index_name = 'sample'
doc_type = 'dummy'
content = 'content'
document = 'question'
identity = 'id'
def main():
    es = Elasticsearch('localhost:9200')
    create_indices(es, index_name)
    # note: a match query also runs the query string through the field's
    # analyzer, so stop words would be stripped from both sides
    res = es.search(index=index_name, doc_type=doc_type,
                    body={
                        "query": {
                            "match": {
                                'content': "is it possible"
                            }
                        }
                    })
    print("%d documents found:" % len(res['hits']['hits']))
    for doc in res['hits']['hits']:
        print("%s) %s %s" % (doc['_id'], doc['_source']['content'], doc['_score']))
def create_indices(es, index_name):
    bulk_data = []
    # columns: id,qid1,qid2,question1 -- qid1 is used as the document id
    with open('issue_with_stop_word.csv', newline='') as csvin:
        reader = csv.reader(csvin)
        next(reader)  # skip the header row
        for row in reader:
            value = dict()
            value[content] = row[3].strip()
            value[identity] = int(row[1])
            bulk_data.append(value)

    if es.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        res = es.indices.delete(index=index_name)
        print(" response: '%s'" % (res))
    # since we are running locally, use one shard and no replicas
    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "filter": {
                    "my_stop": {
                        "type": "stop",
                        "stopwords": ["is", "it", "possible"]
                    }
                }
            }
        }
    }
    print("creating '%s' index..." % (index_name))
    res = es.indices.create(index=index_name, body=request_body)
    print(" response: '%s'" % (res))
    # bulk index the data
    print("bulk indexing...")
    actions = [
        {
            "_index": index_name,
            "_type": doc_type,
            "_id": val[identity],
            content: val[content]
        }
        for val in bulk_data
    ]
    res = helpers.bulk(es, actions, refresh=True)
if __name__ == '__main__':
    main()
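For reference, my current guess is that defining the filter under analysis.filter is not enough on its own, and that I would also need a custom analyzer referencing my_stop plus a mapping that applies it to the content field, roughly like the sketch below (unverified; "my_analyzer" is a name I made up):

# Unverified guess: wire my_stop into a custom analyzer and apply it to
# the content field via the mapping ("my_analyzer" is my own name).
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "my_stop": {
                    "type": "stop",
                    "stopwords": ["is", "it", "possible"]
                }
            },
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "my_stop"]
                }
            }
        }
    },
    "mappings": {
        doc_type: {
            "properties": {
                content: {"type": "text", "analyzer": "my_analyzer"}
            }
        }
    }
}

Is that the missing piece, or should the filter alone already apply at index time?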