这是我训练我的 doc2vec 模型的代码
from gensim.models.doc2vec import Doc2Vec
from FileDocIterator import FileDocIterator
doc_file_name = 'doc_6million.txt'
docs = FileDocIterator(doc_file_name)
print "Fitting started"
model = Doc2Vec(docs, size=100, window=5, min_count=5, negative=20, workers=6, iter=4)
print "Saving model"
model.save("doc2vec_model")
print "model saved"
现在让我们来看看FileDocIterator
import json
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Phrases
class FileDocIterator(object):
def __init__(self, fileName):
self.fileName = fileName
self.phrase = Phrases.load("phrases")
def __iter__(self):
for line in open(self.fileName):
jsData = json.loads(line)
yield TaggedDocument(words=jsData["data"], tags=jsData["id"])
现在我明白在这个实现中没有使用短语,但请耐心等待,让我们看看数据的样子。这是第一个数据点
{"data":["strategic","and","analytical","technical","program","director","and","innovator","who","inspires","calculated","risk-taking","in","emerging","technologies",",","such","as","cyber","security",",","risk",",","analytics",",","big","data",",","cloud",",","mobility","and","3d","printing",".","known","for","growing","company","profit","through","innovative","thinking","aimed","at","improving","employee","productivity","and","providing","solutions","to","private","industry","and","government","customers",".","recognized","for","invigorating","creative","thinking","and","collaboration","within","large","companies","to","leverage","their","economies","of","scale","to","capture","market","share",".","successful","in","managing","the","risk","and","uncertainty","throughout","the","innovation","lifecycle","by","leveraging","an","innovation","management","framework","to","overcome","barriers",".","track","record","of","producing","results","in","competitive",",","rapidly","changing","environments","where","innovation","and","customer","satisfaction","is","the","business",".","competencies","include",":","innovation","management","cyber",",","risk",",","analytics",",","cloud","computing","and","mobility","technology","development","security","compliance",":","dod/ic","(","nispom",",","icd","503",",","fedramp",")","commercial","(","iso/iec","27002",",","pci","dss",")","relationship","management",":","dod",",","public","sector","and","intelligence","community","change","management","it","security","&","risk","management","(","cissp",")","program",",","product","&","portfolio","management","(","pmp",")","data","analytics","management","(","cchd",")","itil","service",
"management","(","itilv3-expert",")"],
"id":"55c37f730d03382935e12767"}
我的理解是id,55c37f730d03382935e12767
应该是文档的id,所以做以下应该给我一个docVector。
model.docvecs["55c37f730d03382935e12767"]
相反,这是输出的内容。
>>> model.docvecs["55c37f730d03382935e12767"]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 341, in __getitem__
return self.doctag_syn0[self._int_index(index)]
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 315, in _int_index
return self.max_rawint + 1 + self.doctags[index].offset
KeyError: '55c37f730d03382935e12767'
试图获得最相似的结果如下
>>> model.docvecs.most_similar("55c37f730d03382935e12767")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 450, in most_similar
raise KeyError("doc '%s' not in trained set" % doc)
KeyError: "doc '55c37f730d03382935e12767' not in trained set"
我想了解的是如何保存文档向量以及使用什么 id。我的方法的哪一部分在上面不起作用?
现在这里有一些有趣的事情,如果我执行以下操作,我会返回类似的文档向量,但它们对我没有任何意义。
>>> model.docvecs.most_similar(str(1))
[(u'8', 0.9000369906425476), (u'3', 0.8878246545791626), (u'7', 0.886141836643219), (u'2', 0.8834314942359924), (u'e', 0.8812381029129028), (u'a', 0.8648831248283386), (u'd', 0.8587037920951843), (u'0', 0.8413013219833374), (u'4', 0.8385311365127563), (u'c', 0.8290119767189026)]