我使用svm
模型scikit-learner
来预测 MNIST 中的笔迹。
但是,我得到的结果很混乱。当我使用经过训练的模型来预测已经在学习过程中使用的训练集时,准确率是 100%
在处理测试数据时,我只得到了大约 11% 的准确率。
除了过度拟合,我找不到原因。过拟合对结果有这么大的影响吗?
# coding:utf-8
from numpy import *
from sklearn import svm
from sklearn.externals import joblib
def loadData(fileName):
fr = open(fileName)
numFeat = len(fr.readline().split(',')) - 1
featMatTrain = []
labelVecTrain = []
featMatTest = []
labelVecTest = []
i = 0
for line in fr.readlines():
i = i + 1
if i != 1 and i <=30000:
curLine = line.strip().split(',')
curLine = map(float,curLine)
labelVecTrain.append(curLine[0])
featMatTrain.append(curLine[1:numFeat])
if i >= 30000:
curLine = line.strip().split(',')
curLine = map(float,curLine)
labelVecTest.append(curLine[0])
featMatTest.append(curLine[1:numFeat])
print '*************************** the training data we got: *****************************'
print 'featMat:''type of element:',type(featMatTrain) ,'shape of featMat:', shape(featMatTrain)
print 'labelVec:''type of element:',type(labelVecTrain),'shape of labelVec:',shape(labelVecTrain)
print 'featMat:''type of element:',type(featMatTest) ,'shape of featMat:', shape(featMatTest)
print 'labelVec:''type of element:',type(labelVecTest),'shape of labelVec:',shape(labelVecTest)
return array(featMatTrain),array(labelVecTrain),array(featMatTest),array(labelVecTest)
featMatTrain,labelVecTrain,featMatTest,labelVecTest= loadData('C:/Users/sun/Desktop/train.csv')
clf = svm.SVC()
clf.fit(featMatTrain,labelVecTrain)
joblib.dump(clf,'svmModel.pkl')
print '***************** we finish training **********************'
labelVecPredict1 = clf.predict(featMatTrain)
labelVecPredict2 = clf.predict(featMatTest)
print '***************** we finish predicting **********************'
count1 = 0.0
for i in range(len(featMatTrain)):
if labelVecPredict1[i] == labelVecTrain[i]:
count1 = count1 + 1
print '************* the result of predicting training set ***************'
print 'the number of figures that predict right: ',count1
print 'the accuary is :',count1/len(featMatTrain)
count2 = 0.0
for i in range(len(featMatTest)):
if labelVecPredict2[i] == labelVecTest[i]:
count2 = count2 + 1
print '************ the result to predicting testing set ************'
print 'the number of figures that predict right:',count2
print 'the accuary is:',count2/len(featMatTest)