0

我正在对亚马逊食品评论进行情绪分析,我正在尝试将 Word2Vec 应用于评论并使用 t-SNE 将其可视化。

使用以下代码,我可以轻松地使用 Bag of words 表示来进行可视化:

    from sklearn.manifold import TSNE

    # Restrict both the count matrix and the labels to the first 2000 rows.
    data_2000 = final_counts[0:2000,:]
    top_2000 = data_2000.toarray()
    labels = final['Score']
    labels_2000 = labels[0:2000]

    model = TSNE(n_components=2, random_state=0)
    tsne_data = model.fit_transform(top_2000)

    # Create a new data frame that helps with plotting the result: the two
    # t-SNE dimensions plus the label as a third column.
    tsne_data = np.vstack((tsne_data.T, labels_2000)).T
    tsne_df = pd.DataFrame(data=tsne_data, columns=("Dim_1", "Dim_2", "label"))

    # Plot the t-SNE result, one colour per label.
    # NOTE(review): newer seaborn releases renamed FacetGrid's `size`
    # argument to `height` - confirm against the installed version.
    sns.FacetGrid(tsne_df, hue="label", size=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
    plt.show()

但是,当我传入类型为 gensim.models.word2vec.Word2Vec 的 w2v_model 模型时,同样的代码不起作用。

我使用以下代码获得了模型:

     # Fit Word2Vec on list_of_sent with min_count=5, size=50 and 4 workers.
     w2v_model = gensim.models.Word2Vec(
         list_of_sent,
         min_count=5,
         size=50,
         workers=4,
     )
4

2 回答 2

0

训练模型后,您需要提取所有词嵌入。我建议通过以下方式提取到 pd.DataFrame 中:

# Every word known to the trained model.
all_vocab = list(w2v_model.wv.vocab)

# Map each word to its embedding vector.
data_dict = {}
for word in all_vocab:
    data_dict[word] = w2v_model.wv[word]

# Words become columns on construction; transpose so each row is one word.
result = pd.DataFrame(data_dict).T

如果您想在 scikit-learn 中执行降维,只需通过 `result.values` 访问嵌入数组即可。

于 2018-05-06T15:11:54.313 回答
0
from torchtext import vocab
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Load the GloVe vectors requested as name "6B", dim 100.
glove = vocab.GloVe(name="6B", dim=100)

# Report the vocabulary size (length of the index-to-string list).
print(f'There are {len(glove.itos)} words in the vocabulary')

def tsne_plot(glove, n=200, n_components=2):
    """Create a t-SNE model for the first ``n`` vectors and plot the result.

    Args:
        glove: Object exposing ``itos`` (words) and ``vectors`` (items with a
            ``.numpy()`` method), indexed in parallel.
        n: Number of leading entries of ``itos`` / ``vectors`` to embed.
            NOTE(review): scikit-learn documents that ``perplexity`` (fixed
            at 40 here) must be smaller than the number of samples - confirm
            callers pass ``n`` > 40.
        n_components: Output dimensionality passed to ``TSNE``. ``3`` draws
            a 3-D scatter; any other value plots the first two output
            columns in 2-D.

    Returns:
        Tuple ``(new_values, labels)``: the array returned by
        ``TSNE.fit_transform`` and the list of words in matching order.
    """
    # zip() stops at the shorter of the two slices, so labels and vectors
    # always stay aligned.
    pairs = list(zip(glove.itos[:n], glove.vectors[:n]))
    labels = [word for word, _ in pairs]
    # Pass TSNE a 2-D array rather than a list of row arrays: recent
    # scikit-learn releases read ``X.shape`` before validating the input.
    tokens = np.array([tensor_value.numpy() for _, tensor_value in pairs])

    tsne_model = TSNE(perplexity=40, n_components=n_components, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    fig = plt.figure(figsize=(16, 16))
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(new_values[:, 0], new_values[:, 1], new_values[:, 2], c="r", marker="o")
        # Label every point with its word.
        for point, word in zip(new_values, labels):
            ax.text(point[0], point[1], point[2], word)
    else:
        plt.scatter(new_values[:, 0], new_values[:, 1])
        # Label every point with its word, offset slightly from the marker.
        for point, word in zip(new_values, labels):
            plt.annotate(word,
                         xy=(point[0], point[1]),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
    return new_values, labels
# Run the 2-D projection on the loaded GloVe vectors and keep both results.
new_values, labels = tsne_plot(glove, n_components=2)
于 2019-12-07T16:44:20.747 回答