希望 `result` 显示的是特征名称(例如 "cat good"),而不是行、列索引。目前做不到:`get_feature_names()` 只返回一维的特征名称列表,无法直接套用到二维的共现矩阵上;而 `np.where()` 只返回满足条件的元素的行、列索引号。
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp
import numpy as np
import pandas as pd
# Build a bigram co-occurrence matrix for a small corpus and report which
# pairs of bigrams appear together in at least one document, by NAME rather
# than by row/column index.
docs = [
    'this this this book',
    'this cat good',
    'cat good shit',
]

# ngram_range=(2, 2) means the features are bigrams (the default would be unigrams).
count_model = CountVectorizer(ngram_range=(2, 2))
X = count_model.fit_transform(docs)

# Binarize the document-term matrix so that a bigram repeated inside one
# document is counted once; skip this line to keep within-document counts.
X[X > 0] = 1

# Feature-by-feature co-occurrence counts. `@` is always the matrix product,
# whereas `*` is element-wise for the newer SciPy sparse *array* classes.
Xc = X.T @ X
Xc.setdiag(0)  # a feature trivially co-occurs with itself; zero that out

# Dense ndarray (not np.matrix), then keep only the strict upper triangle so
# each unordered pair is reported once.
xx = Xc.toarray()
tri_upper_no_diag = np.triu(xx, k=1)
yy = tri_upper_no_diag.tolist()  # plain nested-list view, kept for compatibility

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installations. Position i in `columns` names
# row i / column i of the co-occurrence matrix.
if hasattr(count_model, "get_feature_names_out"):
    columns = list(count_model.get_feature_names_out())
else:
    columns = list(count_model.get_feature_names())

arr = tri_upper_no_diag

# `result` is a (row_indices, col_indices) tuple of the non-zero cells.
result = np.where(arr > 0)

# Translate the index pairs into feature names by looking each index up in
# `columns`, and attach the co-occurrence count of each pair.
row_idx, col_idx = result
result_names = pd.DataFrame({
    "feature_1": [columns[i] for i in row_idx],
    "feature_2": [columns[j] for j in col_idx],
    "count": arr[row_idx, col_idx],
})
result_names