我有以下用于主题建模的代码。我需要从 CSV 文件中读取文本数据，并将这些文本分为 2 个主题，分别命名为 Topic 0 和 Topic 1：
Input: Website name, Text
Output: Website name, Text, Topic
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the top words and top documents for every topic.

    For each row of ``H`` the function prints a ``Topic <idx>:`` header,
    then the highest-weighted feature names joined by spaces, then the
    documents with the highest weight in the matching column of ``W``.

    Args:
        H: 2-D NumPy array iterated row by row; each row holds one topic's
            weights, indexed in step with ``feature_names``.
        W: 2-D NumPy array whose column ``topic_idx`` holds the weight of
            every document for that topic, indexed in step with
            ``documents``.
        feature_names: Sequence mapping a feature index to its term.
        documents: Sequence mapping a document index to its text.
        no_top_words: Number of top-weighted terms to print per topic.
        no_top_documents: Number of top-weighted documents to print per
            topic.

    Returns:
        None. All output goes to standard output.
    """
    for topic_idx, topic in enumerate(H):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Topic %d:" % (topic_idx))
        # argsort() is ascending; the reversed slice keeps the last
        # `no_top_words` indices, i.e. the largest weights first.
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        # Rank documents by their weight for this topic, highest first.
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
# Texts to model, one string per document.
documents = [
    # data belonging to the "Text" column of the CSV file goes in here
]

# Number of topics to extract; the task asks for two (Topic 0 and Topic 1).
no_topics = 2

# Build the TF-IDF matrix: drop English stop words, terms appearing in more
# than 95% of the documents, and terms appearing in fewer than 2 documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(), and NMF's `alpha` with `alpha_W` / `alpha_H` --
# confirm against the installed version before upgrading.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Fit NMF on the TF-IDF matrix; random_state is fixed so runs are repeatable.
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)  # per-document topic weights
nmf_H = nmf_model.components_       # per-topic term weights

no_top_words = 2
no_top_documents = 4
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
例子:
输入:
Website name Text
------------------------------
www.xyz.com I eat bananas
www.abc.com I eat apples
www.123.com I drive a bike
www.abx.com I drive a car
输出:
Website name Text Topic
--------------------------------------------
www.xyz.com I eat bananas Topic 0
www.abc.com I eat apples Topic 0
www.123.com I drive a bike Topic 1
www.abx.com I drive a car Topic 1