我正在关注 LSA 的教程并将示例切换到不同的字符串列表,我不确定代码是否按预期工作。
当我使用教程中给出的示例输入时,它会产生合理的答案。但是,当我使用自己的输入时,会得到非常奇怪的结果。
为了比较,以下是示例输入的结果:
当我使用自己的示例时,结果就是这样。另外值得注意的是,我似乎没有得到一致的结果:
任何帮助弄清楚为什么我会得到这些结果将不胜感激:)
这是代码:
import sklearn
# Import all of the scikit learn stuff
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd
import warnings
# Suppress warnings from pandas library
warnings.filterwarnings("ignore", category=DeprecationWarning,
module="pandas", lineno=570)
import numpy
example = ["Coffee brewed by expressing or forcing a small amount of
nearly boiling water under pressure through finely ground coffee
beans.",
"An espresso-based coffee drink consisting of espresso with
microfoam (steamed milk with small, fine bubbles with a glossy or
velvety consistency)",
"American fast-food dish, consisting of french fries covered in
cheese with the possible addition of various other toppings",
"Pounded and breaded chicken is topped with sweet honey, salty
dill pickles, and vinegar-y iceberg slaw, then served upon crispy
challah toast.",
"A layered, flaky texture, similar to a puff pastry."]
''''
example = ["Machine learning is super fun",
"Python is super, super cool",
"Statistics is cool, too",
"Data science is fun",
"Python is great for machine learning",
"I like football",
"Football is great to watch"]
'''
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
dtm = vectorizer.fit_transform(example)
pd.DataFrame(dtm.toarray(),index=example,columns=vectorizer.get_feature_names()).head(10)
# Get words that correspond to each column
vectorizer.get_feature_names()
# Fit LSA. Use algorithm = “randomized” for large datasets
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(dtm.astype(float))
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
pd.DataFrame(lsa.components_,index = ["component_1","component_2"],columns = vectorizer.get_feature_names())
pd.DataFrame(dtm_lsa, index = example, columns = "component_1","component_2"])
xs = [w[0] for w in dtm_lsa]
ys = [w[1] for w in dtm_lsa]
xs, ys
# Plot scatter plot of points
%pylab inline
import matplotlib.pyplot as plt
figure()
plt.scatter(xs,ys)
xlabel('First principal component')
ylabel('Second principal component')
title('Plot of points against LSA principal components')
show()
#Plot scatter plot of points with vectors
%pylab inline
import matplotlib.pyplot as plt
plt.figure()
ax = plt.gca()
ax.quiver(0,0,xs,ys,angles='xy',scale_units='xy',scale=1, linewidth = .01)
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])
xlabel('First principal component')
ylabel('Second principal component')
title('Plot of points against LSA principal components')
plt.draw()
plt.show()
# Compute document similarity using LSA components
similarity = np.asarray(numpy.asmatrix(dtm_lsa) *
numpy.asmatrix(dtm_lsa).T)
pd.DataFrame(similarity,index=example, columns=example).head(10)