我已经用 gensim 训练了 LSI 模型,但不知道如何得到"覆盖率",也就是确定需要多少个主题(n)才能充分描述语料库。
我能够获得主题向量,但可以用它们来计算覆盖率吗?这个覆盖率与连贯性(coherence)或困惑度(perplexity)有区别吗?
def prepare_corpus(doc_clean):
    """
    Build the term dictionary and both corpus representations.

    Input  : doc_clean - the cleaned documents (presumably a list of token
             lists, which is what gensim's Dictionary/doc2bow consume)
    Output : tuple (dictionary, doc_term_matrix, tfidf_corpus) holding the
             term dictionary, the bag-of-words corpus, and the same corpus
             re-weighted with tf-idf
    """
    # Assign an integer id to every unique term found in the documents.
    term_index = corpora.Dictionary(doc_clean)

    # Turn each document into its bag-of-words vector using that dictionary.
    bow_vectors = []
    for tokens in doc_clean:
        bow_vectors.append(term_index.doc2bow(tokens))

    # Fit tf-idf on the bag-of-words corpus, then apply it to that same corpus.
    weighted_corpus = models.TfidfModel(bow_vectors)[bow_vectors]

    return term_index, bow_vectors, weighted_corpus
def create_gensim_lsa_model(doc_clean, number_of_topics):
    """
    Train a gensim LSI (LSA) model on the tf-idf weighted corpus.

    Input  : doc_clean - the cleaned documents, handed to prepare_corpus
             number_of_topics - value passed to LsiModel as num_topics
    Output : the LsiModel built from the tf-idf corpus
    """
    # The raw bag-of-words corpus is returned by prepare_corpus too, but the
    # model is built on the tf-idf weighted corpus only.
    term_index, _, weighted_corpus = prepare_corpus(doc_clean)
    return LsiModel(
        weighted_corpus,
        id2word=term_index,
        num_topics=number_of_topics,
    )