Unfortunately, there is no out-of-the-box coherence model for `sklearn.decomposition.NMF`.
I ran into the same problem and found a custom implementation that works with Python 3.8. It should be easy to adapt to your code; check the link for the full imports and preprocessing steps.

A snippet I used recently with this technique:
```python
import re
from itertools import combinations

import numpy as np
import gensim
import matplotlib
import matplotlib.pyplot as plt
from sklearn import decomposition

kmin, kmax = 2, 30
topic_models = []
# try each value of k
for k in range(kmin, kmax + 1):
    print("Applying NMF for k=%d ..." % k)
    # run NMF on A, the document-term matrix (see the sketch below this block)
    model = decomposition.NMF(init="nndsvd", n_components=k)
    W = model.fit_transform(A)
    H = model.components_
    # store for later
    topic_models.append((k, W, H))
```
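The snippet assumes a TF-IDF document-term matrix `A`, the matching vocabulary `terms`, plus the raw documents `docs_raw` and a stopword set `stop_words` from earlier preprocessing. None of those are defined here, so as a minimal sketch (the `min_df` value is an assumption, not from the original):

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# docs_raw: list of raw document strings; stop_words: stopword set (assumed to exist)
vectorizer = TfidfVectorizer(stop_words=list(stop_words), min_df=20)
A = vectorizer.fit_transform(docs_raw)
# vocabulary aligned with the columns of A (use get_feature_names() on scikit-learn < 1.0)
terms = vectorizer.get_feature_names_out()
```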
Next, train a Word2Vec model on the same corpus; it is only used to score the similarity between topic terms:

```python
class TokenGenerator:
    """Streams tokenized documents so gensim can iterate over the corpus repeatedly."""
    def __init__(self, documents, stopwords):
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile(r"(?u)\b\w\w+\b")

    def __iter__(self):
        print("Building Word2Vec model ...")
        for doc in self.documents:
            tokens = []
            for tok in self.tokenizer.findall(doc):
                if tok.lower() in self.stopwords:
                    tokens.append("<stopword>")
                elif len(tok) >= 2:
                    tokens.append(tok.lower())
            yield tokens

docgen = TokenGenerator(docs_raw, stop_words)
# the "size" parameter was renamed to "vector_size" in gensim 4.0
w2v_model = gensim.models.Word2Vec(docgen, vector_size=500, min_count=20, sg=1)
```
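As a quick sanity check, you can query pairwise similarities directly; the two words here are hypothetical and just need to occur at least `min_count` times in your corpus:

```python
# hypothetical example words; replace with terms from your own vocabulary
print(w2v_model.wv.similarity("economy", "market"))
```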
Coherence for one model is the mean pairwise Word2Vec similarity over each topic's top terms, averaged across all topics:

```python
def calculate_coherence(w2v_model, term_rankings):
    overall_coherence = 0.0
    for topic_index in range(len(term_rankings)):
        # check each pair of terms
        pair_scores = []
        for pair in combinations(term_rankings[topic_index], 2):
            # model.similarity() was removed in gensim 4.0; use model.wv.similarity()
            pair_scores.append(w2v_model.wv.similarity(pair[0], pair[1]))
        # get the mean for all pairs in this topic
        topic_score = sum(pair_scores) / len(pair_scores)
        overall_coherence += topic_score
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)
```
A helper that pulls the top-ranked terms for a topic out of the NMF factor H:

```python
def get_descriptor(all_terms, H, topic_index, top):
    # reverse sort the row values to rank the term indices
    top_indices = np.argsort(H[topic_index, :])[::-1]
    # now get the terms corresponding to the top-ranked indices
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append(all_terms[term_index])
    return top_terms
```
Now score every candidate model:

```python
k_values = []
coherences = []
for (k, W, H) in topic_models:
    # get all of the topic descriptors - the term_rankings, based on the top 10 terms
    term_rankings = []
    for topic_index in range(k):
        term_rankings.append(get_descriptor(terms, H, topic_index, 10))
    # now calculate the coherence based on our Word2Vec model
    k_values.append(k)
    coherences.append(calculate_coherence(w2v_model, term_rankings))
    print("K=%02d: Coherence=%.4f" % (k, coherences[-1]))
```
Finally, plot coherence against k and annotate the maximum:

```python
%matplotlib inline
plt.style.use("ggplot")
matplotlib.rcParams.update({"font.size": 14})

fig = plt.figure(figsize=(13, 7))
# create the line plot
plt.plot(k_values, coherences)
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Mean Coherence")
# add the points
plt.scatter(k_values, coherences, s=120)
# find and annotate the maximum point on the plot
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
# with textcoords="offset points", xytext must be a point offset, not data coordinates
plt.annotate("k=%d" % best_k, xy=(best_k, ymax), xytext=(0, 10),
             textcoords="offset points", fontsize=16)
# show the plot
plt.show()
```
The result:

```
K=02: Coherence=0.4157
K=03: Coherence=0.4399
K=04: Coherence=0.4626
K=05: Coherence=0.4333
K=06: Coherence=0.4075
K=07: Coherence=0.4121
...
```
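Once the plot identifies the best k, the matching factorization is already stored in `topic_models`; a minimal sketch reusing `best_k` and the helpers defined above:

```python
# look up the model for the best k (topic_models is indexed from kmin)
k, W, H = topic_models[best_k - kmin]
for topic_index in range(k):
    descriptor = get_descriptor(terms, H, topic_index, 10)
    print("Topic %02d: %s" % (topic_index + 1, ", ".join(descriptor)))
```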