1

我正在按照本教程,尝试使用 PyLDAvis 可视化主题模型(使用 gensim LDA 模型构建)的结果,但我不断遇到下面这个常见的错误:

index 11588 is out of bounds for axis 1 with size 11588

我在 Stack Overflow 和 GitHub 上搜索过,发现很多人以前都遇到过这个问题,但都是在旧版本中。我目前使用的是 PyLDAvis 3.2.2(我也试过最新版本,但同样没有解决)。

我是 Python 和机器学习的新手,无法自行调试这个问题,任何帮助或指导我都将不胜感激。

这是我的 jupyter 笔记本代码:

# Build the tokenized corpus: one token list per line of the input file.
# `preprocess` is defined elsewhere; per the original note it is expected to
# tokenize, remove stopwords and non-alphabetic words, and lowercase.
filename = 'booksummaries.txt'
tokenized_docs_summaries = []
# Open the file in a context manager so the handle is closed even if
# preprocessing raises part-way through.
with open(filename, encoding="utf-8") as summaries_file:
    for line in summaries_file:
        temp = line.split("\t")
        # The text handed to `preprocess` is the tab-separated field at index 6.
        tokenized_docs_summaries.append(preprocess(temp[6]))
# Vocabulary built from exactly the documents tokenized above.
dictionary = Dictionary(tokenized_docs_summaries)

# Alternative (Windows) Mallet location, kept for reference:
# os.environ.update({'MALLET_HOME':r'C:/mallet-2.0.8/'})
# #You should update this path as per the path of Mallet directory on your system.
# mallet_path = r'C:/mallet-2.0.8/bin/mallet'
# Point MALLET_HOME and the Mallet executable path at the /content install.
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet' # you should NOT need to change this
# Train a 5-topic Mallet LDA model.
# NOTE(review): `corpus` and `id2word` are not defined in the visible code;
# confirm `id2word` is the same vocabulary as `dictionary` built above.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
# The coherence model is given `dictionary`, while the model above was trained
# with `id2word` -- TODO confirm these two refer to the same vocabulary.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=tokenized_docs_summaries, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a ``CoherenceModel``.

    Parameters:
    ----------
    dictionary : Gensim dictionary; used both as the model's id2word mapping
        and for the coherence computation, so it must be the vocabulary that
        `corpus` was built with
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in rather than a
        # notebook-global `id2word`, so the model vocabulary and the
        # vocabulary used for coherence cannot drift apart.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Train and score one model for each topic count in range(2, 40, 4).
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=tokenized_docs_summaries, start=2, limit=40, step=4)

# NOTE(review): `best_result_index` is not defined in the visible code;
# presumably it is the position of the best entry in `coherence_values` -- confirm.
optimal_model = model_list[best_result_index]

# Fetch the selected model's topics (nothing is printed here).
model_topics = optimal_model.show_topics(formatted=False)
def convertldaGenToldaMallet(mallet_model):
    """Build an ``LdaModel`` from a trained Mallet wrapper model.

    Despite the name, the conversion goes from the Mallet wrapper to
    ``LdaModel``: a new ``LdaModel`` is created with the wrapper's
    ``id2word``, ``num_topics`` and ``alpha`` (and ``eta=0``), the wrapper's
    ``wordtopics`` are copied into the new model's ``state.sstats``, and
    ``sync_state()`` is called.

    Parameters:
    ----------
    mallet_model : object exposing ``id2word``, ``num_topics``, ``alpha`` and ``wordtopics``

    Returns:
    -------
    The newly created ``LdaModel``.
    """
    vocabulary = mallet_model.id2word
    topic_count = mallet_model.num_topics
    doc_topic_prior = mallet_model.alpha

    converted = LdaModel(
        id2word=vocabulary,
        num_topics=topic_count,
        alpha=doc_topic_prior,
        eta=0,
    )
    # Overwrite the statistics in place with Mallet's word-topic values,
    # then let the model refresh itself from that state.
    converted.state.sstats[...] = mallet_model.wordtopics
    converted.sync_state()
    return converted

# Convert the selected Mallet model so it can be handed to pyLDAvis.
optimal_model = convertldaGenToldaMallet(optimal_model)
# Creating Topic Distance Visualization
pyLDAvis.enable_notebook()
# Pass the id2word mapping stored on the model instead of the separately built
# `dictionary`. If the dictionary holds more tokens than the model has terms,
# prepare() indexes past the end of the topic-term matrix, which shows up as
# "index N is out of bounds for axis 1 with size N".
p = pyLDAvis.gensim.prepare(optimal_model, corpus, optimal_model.id2word)
# Bare expression so the notebook renders the visualization.
p
4

0 回答 0