My code is below. Even though I have listed ["is","it","possible"] in a stop-word filter, those words still show up in the search output. Can someone explain why Elasticsearch is not removing them from the input documents at index time?
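In case it helps with debugging, I assume the _analyze API can show which tokens the index actually produces (untested sketch, using the es client and index_name from the script below; the sample sentence is just a row from my CSV):

# Untested sketch: analyzing against the field shows which analyzer
# Elasticsearch actually applies to it at index time.
res = es.indices.analyze(index=index_name, body={
    "field": "content",
    "text": "Is it possible to sleep without dreaming?",
})
print([t["token"] for t in res["tokens"]])

# The my_stop filter can also be exercised in isolation; lowercase first,
# since the stop filter is case-sensitive by default.
res = es.indices.analyze(index=index_name, body={
    "tokenizer": "standard",
    "filter": ["lowercase", "my_stop"],
    "text": "Is it possible to sleep without dreaming?",
})
print([t["token"] for t in res["tokens"]])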
issue_with_stop_word.csv is as follows:
id,qid1,qid2,question1
5,11,12,How do I recover my Facebook login password?
7,15,16,Is it possible to sleep without dreaming?
11,23,24,How easy is it to hack the login password of a Macbook Air?
12,25,26,How easy is it to hack the login password of a Macbook Air?
13,27,28,Is it possible to know who visited my Facebook profile?
15,31,32,Is it possible to know who visited my Facebook profile?
16,33,34,Is it possible to know who visited my Facebook profile?
18,37,38,Is it possible to hack someone's Facebook messages?
20,41,42,Is it possible to know who visited my Facebook profile?
29,59,60,How do I recover my Facebook password without having to reset it?
31,63,64,What are some special cares for someone with a nose that gets stuffy during the night?
32,65,66,What Game of Thrones villain would be the most likely to give you mercy?
The code is as follows:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import csv

query = 'Is it possible ?'
index_name = 'sample'
doc_type = 'dummy'
content = 'content'
document = 'question'
identity = 'id'
def main():
    es = Elasticsearch('localhost:9200')
    create_indices(es, index_name)
    # note: a match query also runs the query string through the field's
    # analyzer, so stop words would be stripped from both sides
    res = es.search(index=index_name, doc_type=doc_type,
                    body={
                        "query": {
                            "match": {
                                'content': "is it possible"
                            }
                        }
                    })
    print("%d documents found:" % len(res['hits']['hits']))
    for doc in res['hits']['hits']:
        print("%s) %s %s" % (doc['_id'], doc['_source']['content'], doc['_score']))
def create_indices(es, index_name):
    bulk_data = []
    # columns: id,qid1,qid2,question1 -- qid1 is used as the document id
    with open('issue_with_stop_word.csv', newline='') as csvin:
        reader = csv.reader(csvin)
        next(reader)  # skip the header row
        for row in reader:
            value = dict()
            value[content] = row[3].strip()
            value[identity] = int(row[1])
            bulk_data.append(value)

    if es.indices.exists(index_name):
        print("deleting '%s' index..." % (index_name))
        res = es.indices.delete(index=index_name)
        print(" response: '%s'" % (res))
    # since we are running locally, use one shard and no replicas
    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "filter": {
                    "my_stop": {
                        "type": "stop",
                        "stopwords": ["is", "it", "possible"]
                    }
                }
            }
        }
    }
    print("creating '%s' index..." % (index_name))
    res = es.indices.create(index=index_name, body=request_body)
    print(" response: '%s'" % (res))
    # bulk index the data
    print("bulk indexing...")
    actions = [
        {
            "_index": index_name,
            "_type": doc_type,
            "_id": val[identity],
            content: val[content]
        }
        for val in bulk_data
    ]
    res = helpers.bulk(es, actions, refresh=True)
if __name__ == '__main__':
    main()
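For reference, my current guess is that defining the filter under analysis.filter is not enough on its own, and that I would also need a custom analyzer referencing my_stop plus a mapping that applies it to the content field, roughly like the sketch below (unverified; "my_analyzer" is a name I made up):

# Unverified guess: wire my_stop into a custom analyzer and apply it to
# the content field via the mapping ("my_analyzer" is my own name).
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "filter": {
                "my_stop": {
                    "type": "stop",
                    "stopwords": ["is", "it", "possible"]
                }
            },
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "my_stop"]
                }
            }
        }
    },
    "mappings": {
        doc_type: {
            "properties": {
                content: {"type": "text", "analyzer": "my_analyzer"}
            }
        }
    }
}

Is that the missing piece, or should the filter alone already apply at index time?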