代码:
import sys

import numpy
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# Train a scikit-learn classifier on document vectors taken from a saved
# Doc2Vec model and print its accuracy on the held-out test vectors.
#
# Usage:
#     python <script> train_pos_count train_neg_count {-lr|-svm|-knn|-rf}
#
# NOTE: when this file is executed from IPython/Spyder without arguments,
# sys.argv holds fewer than four entries, so the script prints the usage
# message and raises SystemExit.  Either pass the arguments explicitly
# (e.g. ``%run <script> 1000 1000 -lr``) or call
# ``main(['<script>', '1000', '1000', '-lr'])`` directly.

MODEL_PATH = 'sentiment140.d2v'
TEST_POS_COUNT = 144
TEST_NEG_COUNT = 144
# Assumed to match the vector size the Doc2Vec model was trained with;
# a mismatch makes the row assignment in build_dataset() fail.
VEC_DIM = 100
CLASSIFIER_FLAGS = ('-lr', '-svm', '-knn', '-rf')
USAGE = "Please input train_pos_count, train_neg_count and classifier!"


def build_dataset(docvecs, pos_prefix, neg_prefix, pos_count, neg_count,
                  vec_dim=VEC_DIM):
    """Collect tagged document vectors into a feature matrix and label vector.

    Args:
        docvecs: mapping from tag (e.g. ``'TRAIN_POS_0'``) to a vector of
            length ``vec_dim``.
        pos_prefix: tag prefix of the positive documents.
        neg_prefix: tag prefix of the negative documents.
        pos_count: number of positive documents, tagged ``0 .. pos_count-1``.
        neg_count: number of negative documents, tagged ``0 .. neg_count-1``.
        vec_dim: length of every document vector.

    Returns:
        ``(arrays, labels)``: ``arrays`` has shape
        ``(pos_count + neg_count, vec_dim)`` with the positive rows first;
        ``labels`` holds 1 for positive rows and 0 for negative rows.

    Raises:
        ValueError: if either count is negative.
        KeyError: if ``docvecs`` is a dict lacking one of the tags (other
            containers may raise a different lookup error).
    """
    if pos_count < 0 or neg_count < 0:
        raise ValueError("document counts must be non-negative")

    total = pos_count + neg_count
    arrays = numpy.zeros((total, vec_dim))
    labels = numpy.zeros(total)

    for i in range(pos_count):
        arrays[i] = docvecs[pos_prefix + str(i)]
    for i in range(neg_count):
        arrays[pos_count + i] = docvecs[neg_prefix + str(i)]

    # Positive rows come first; the remaining rows keep the initial 0 label.
    labels[:pos_count] = 1
    return arrays, labels


def make_classifier(flag):
    """Return a new, unfitted classifier for the given command-line flag.

    Args:
        flag: one of ``'-lr'``, ``'-svm'``, ``'-knn'`` or ``'-rf'``.

    Raises:
        ValueError: if ``flag`` is not a recognised classifier flag.
    """
    if flag == '-lr':
        print("Logistic Regressions is used...")
        return LogisticRegression()
    if flag == '-svm':
        print("Support Vector Machine is used...")
        return SVC()
    if flag == '-knn':
        print("K-Nearest Neighbors is used...")
        return KNeighborsClassifier(n_neighbors=10)
    if flag == '-rf':
        print("Random Forest is used...")
        return RandomForestClassifier()
    raise ValueError(
        "Unknown classifier flag {!r}; expected one of: {}".format(
            flag, ', '.join(CLASSIFIER_FLAGS)))


def main(argv=None):
    """Run the experiment described by ``argv`` (defaults to ``sys.argv``).

    ``argv`` has the shape
    ``[program, train_pos_count, train_neg_count, classifier_flag]``.
    All arguments are validated before the model is loaded, so a usage
    error is reported immediately instead of after the (slow) load.
    Exits with status 1 on invalid arguments.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 4:
        print(USAGE)
        sys.exit(1)

    try:
        train_pos_count = int(argv[1])
        train_neg_count = int(argv[2])
    except ValueError:
        print("train_pos_count and train_neg_count must be integers!")
        sys.exit(1)
    if train_pos_count < 0 or train_neg_count < 0:
        print("train_pos_count and train_neg_count must be non-negative!")
        sys.exit(1)

    classifier_flag = argv[3]
    if classifier_flag not in CLASSIFIER_FLAGS:
        # Previously an unknown flag left the classifier as None and the
        # script crashed on classifier.fit() with an AttributeError.
        print("Unknown classifier {!r}; choose one of: {}".format(
            classifier_flag, ', '.join(CLASSIFIER_FLAGS)))
        sys.exit(1)

    model = Doc2Vec.load(MODEL_PATH)

    print(train_pos_count)
    print(train_neg_count)

    print("Build training data set...")
    train_arrays, train_labels = build_dataset(
        model.docvecs, 'TRAIN_POS_', 'TRAIN_NEG_',
        train_pos_count, train_neg_count, VEC_DIM)

    print("Build testing data set...")
    test_arrays, test_labels = build_dataset(
        model.docvecs, 'TEST_POS_', 'TEST_NEG_',
        TEST_POS_COUNT, TEST_NEG_COUNT, VEC_DIM)

    print("Begin classification...")
    classifier = make_classifier(classifier_flag)
    classifier.fit(train_arrays, train_labels)
    print("Accuracy:", classifier.score(test_arrays, test_labels))


if __name__ == "__main__":
    main()
出现以下错误:
2017-06-08 15:24:18,013 : INFO : 正在从 C:/Users/Desktop/sentiment140.d2v 加载 Doc2Vec 对象
2017-06-08 15:24:21,556 : INFO : 正在从 C:/Users/Desktop/sentiment140.d2v.wv.* 递归加载 wv,mmap=None
2017-06-08 15:24:21,556 : INFO : 将被忽略的属性 syn0norm 设置为 None
2017-06-08 15:24:21,571 : INFO : 正在从 C:/Users/Desktop/sentiment140.d2v.docvecs.* 递归加载 docvecs,mmap=None
2017-06-08 15:24:21,571 : INFO : 将被忽略的属性 cum_table 设置为 None
2017-06-08 15:24:21,571 : INFO : 已加载 C:/Users/Desktop/sentiment140.d2v
Please input train_pos_count, train_neg_count and classifier!
C:\Users\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2889: UserWarning(用户警告):要退出,请使用 'exit'、'quit' 或 Ctrl-D。
warn("要退出:使用 'exit',……
SystemExit(系统退出)