我正在尝试计算每个主题的一致性(coherence)得分,但每次运行代码时得到的值都不一样。
如果有人能解答这个问题,将对我有很大帮助。
提前感谢您。
def build_w2c(self, raw_documents):
    """Build and train a skip-gram Word2Vec model on the given documents.

    The documents are passed through ``TokenGenerator`` (with
    ``self.stop_words``) and each resulting document string is split on
    single spaces to obtain its token list.

    The model is trained by handing the corpus to the ``Word2Vec``
    constructor.  Calling only ``build_vocab`` leaves every vector at its
    random initial value, which makes any similarity computed from the
    model meaningless and different on each run.

    Reproducibility, per the gensim documentation:
      * ``seed`` fixes the random number generator,
      * ``workers=1`` removes ordering jitter from thread scheduling,
      * on Python 3 the ``PYTHONHASHSEED`` environment variable must also
        be set to a fixed value before the interpreter starts, because the
        initial vectors are seeded from a hash of each word.

    NOTE(review): an empty corpus may be rejected by gensim when training
    is requested; confirm callers never pass one.

    Args:
        raw_documents: The raw documents forwarded to ``TokenGenerator``.

    Returns:
        The trained model, which is also stored on ``self.w2v_model``.
    """
    docgen = TokenGenerator(raw_documents, self.stop_words)
    tokenised_docs = [document.split(" ") for document in docgen.documents]

    self.w2v_model = gensim.models.Word2Vec(
        sentences=tokenised_docs,  # builds the vocabulary AND trains
        size=500,
        # min_count is an integer frequency threshold; the previous value
        # 0.0005 kept every word, which is exactly what 1 does.
        min_count=1,
        sg=1,
        seed=1,
        workers=1,
    )
    return self.w2v_model
def get_descriptor(self, all_terms, H, topic_index, top):
    """Return the ``top`` highest-weighted terms for one topic.

    Row ``topic_index`` of ``H`` is sorted in descending order of weight
    and the leading ``top`` column indices are mapped to their entries in
    ``all_terms``.

    Args:
        all_terms: Sequence of terms, indexed by the columns of ``H``.
        H: Two-dimensional array indexed as ``H[topic_index, :]``.
        topic_index: Row of ``H`` to describe.
        top: Number of terms to return.

    Returns:
        A list of terms, strongest first.
    """
    # argsort is ascending, so reverse it to rank the largest weights first
    ranked_columns = np.argsort(H[topic_index, :])[::-1]
    return [all_terms[column] for column in ranked_columns[:top]]
def get_coherence(self, k, terms, H):
    """Score coherence cumulatively over topics ``1 .. k - 1``.

    For each topic index the ten top terms are fetched with
    ``self.get_descriptor`` and appended to a growing list of term
    rankings; ``self.calculate_coherence`` is then evaluated on every
    ranking collected so far.

    Args:
        k: Exclusive upper bound of the topic indices that are visited.
        terms: Term list forwarded to ``get_descriptor``.
        H: Topic-term weight matrix forwarded to ``get_descriptor``.

    Returns:
        A tuple ``(k_values, coherences, max_key)`` where ``k_values`` are
        the visited topic indices, ``coherences`` the score recorded at
        each of them, and ``max_key`` the index with the highest score
        (``None`` when no topic was visited, i.e. ``k <= 1``).
    """
    k_values = []
    term_rankings = []
    coherences = []
    coherence_by_index = {}

    # NOTE(review): the loop starts at 1, so row 0 of H is never scored,
    # and each score covers all rankings gathered so far rather than a
    # single topic.  Left unchanged; confirm this is intended.
    for topic_index in range(1, k):
        print(topic_index)
        descriptor = self.get_descriptor(terms, H, topic_index, 10)
        term_rankings.append(descriptor)

        # Coherence of everything collected so far, from the Word2vec model
        coherence = self.calculate_coherence(term_rankings)
        coherences.append(coherence)
        print("K=%02d: Coherence=%.4f" % (topic_index, coherence))

        k_values.append(topic_index)
        coherence_by_index[topic_index] = coherence

    # max() raises ValueError on an empty mapping, so report "no best
    # index" explicitly instead.
    if not coherence_by_index:
        return k_values, coherences, None

    max_key = max(coherence_by_index, key=coherence_by_index.get)
    return k_values, coherences, max_key
def calculate_coherence(self, term_rankings):
    """Return the mean pairwise Word2Vec similarity across topics.

    Each topic's score is the average similarity over every unordered pair
    of its terms; the result is the average of those topic scores.

    Similarities are looked up through ``self.w2v_model.wv``; calling
    ``similarity`` directly on the model is deprecated in gensim 3.x and
    removed in gensim 4.

    Args:
        term_rankings: A list of topics, each a list of terms.

    Returns:
        The mean topic score as a float.  An empty ``term_rankings``
        yields ``0.0``, and a topic with fewer than two terms (no pairs to
        compare) contributes ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    if not term_rankings:
        return 0.0

    similarity = self.w2v_model.wv.similarity
    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # check each pair of terms
        pair_scores = [
            similarity(first, second)
            for first, second in combinations(topic_terms, 2)
        ]
        if not pair_scores:
            # fewer than two terms: nothing to average for this topic
            continue
        # get the mean for all pairs in this topic
        overall_coherence += sum(pair_scores) / len(pair_scores)

    # get the mean score across all topics
    return overall_coherence / len(term_rankings)
以上是我在项目中使用的代码。
期望的结果:每次运行代码时,得到的一致性值都应该相同。
如果您能帮我解决这个问题,将对我有很大帮助。
非常感谢。