python - clf.fit(X, y) 在数值很小的数据上无法运行完成 - fit() 永远不会结束 - 可能是 sklearn 的 bug：数据科学

Question

这个问题不是要别人帮我调试代码（代码本身可以运行，只是在我使用的这份数据上跑不完）。它是直接针对 sklearn 的 fit() 函数调用提出的技术性问题。

下面的模块调用。

 from sklearn import svm 
 svm.SVC.fit(X,y)

代码应该对数据进行拟合（数据链接在下面提供）

产生如下图所示的图：

希望就下面代码的性能问题或潜在 bug 征求专家意见。请注意，代码本身可以运行，问题出在我所使用的数据上：当我传入经过对数变换的 t-SNE 数据时，模型无法完成拟合。我已经让它运行了好几个小时，而它本应在几秒钟内完成。

我尝试过的办法：等了几个小时让它运行完；另外启动了一台云主机，并付费给谷歌租用了一台高性能机器。

我开始怀疑这可能与所有浮点数的数值都太小有关，不过转换为 float32 应该已经能处理这种情况。非常感谢任何建议或想法。

我在这里使用的数据(X)

对应的y值可以在这里获取

print ("start")

import matplotlib.pyplot as plt
from sklearn import svm


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from sklearn import cross_validation
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import manifold, datasets, decomposition, discriminant_analysis

def plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is rendered on the current matplotlib figure; the caller is
    responsible for creating the figure beforehand and calling ``plt.show()``
    afterwards.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix, indexed as ``cm[true_label, predicted_label]``.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When true, every row is divided by its row sum before printing and
        plotting, and cells are formatted with two decimals instead of as
        integers.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        # Keep the row sums as a column vector so the division broadcasts
        # row by row.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = 'd' if not normalize else '.2f'
    # Cells above half of the maximum get white text so the annotation stays
    # readable on the darker end of the colormap.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > half_max else "black"
            # imshow puts the column index on x and the row index on y.
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


print ('read packages')
# Data loading. Several alternative inputs are kept below as commented-out
# code; only the uncommented statements actually run.
########################################    Log data    ########################################

# df = pd.read_csv('logValuesOfWineData.csv').dropna().astype(np.float32)
# y = df['qualityBand'].values.astype(int)
# y = y.values
# subdf = df[[ 'Logfixed_acidity', 'Logvolatile_acidity','Logcitric_acid', 'Logresidual_sugar', 'Logchlorides',
#        'Logfree_sulfur_dioxide', 'Logtotal_sulfur_dioxide', 'Logdensity',
#        'Logsulphates', 'Logalcohol']]
# y = df['qualityBand'].map({1: 1, 2:2, 3:3})
# # removed free sulfur, residual sugar, volatile acidity looks too normal
# # subdf = df[[ 'Logdensity','Logalcohol']]
# X = subdf.values

########################################    normal data    ########################################

# names = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
#        'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
#        'pH', 'sulphates', 'alcohol']


# Read the wine table, drop every row containing a missing value and cast all
# columns to float32.
df = pd.read_csv('winequalityN.csv').dropna().astype(np.float32)
# This y is overwritten further down before it is ever used.
y = df['qualityBand'].values.astype(int)
# y = y.values
# The eleven feature columns.
subdf = df[['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]

# Binary target: quality bands 1 and 2 become class 0, band 3 becomes class 1.
# The same mapping is applied again at the end of this section.
y = df['qualityBand'].map({1: 0, 2:0, 3:1})
# This X is replaced below by the contents of test.csv.
X = subdf.values


print ('read data')
##################################  PCA      ######################################################
# X = subdf.values

# print ('about to PCA')
# X_pca = decomposition.PCA(n_components=2).fit_transform(X)
# X = X_pca

# No projection is computed here: the feature matrix actually used from this
# point on is whatever test.csv contains.
# NOTE(review): X comes from test.csv and y from winequalityN.csv, each after
# its own dropna(); presumably both files have the same rows in the same
# order — confirm, otherwise X and y are misaligned.
kk = pd.read_csv('test.csv').dropna().astype(np.float32)
X = kk.values
# kk = pd.read_csv('dfX_pca_Normal.csv').dropna().astype(np.float32)
# X = kk.values

print ('finished PCA')
##################################  X_tsne     ######################################################
# X_tsne = manifold.TSNE(n_components=2, init='pca').fit_transform(X)
# X = X_tsne
# kk = pd.read_csv('dfX_tsne_log.csv').dropna().astype(np.float32)
# X = kk.values
# kk = pd.read_csv('dfX_tsne_Normal.csv').dropna().astype(np.float32)
# X = kk.values
##############################################################################################

# Final target vector: same band -> class mapping as above, converted to a
# NumPy array. Per pandas Series.map semantics, a band value other than
# 1, 2 or 3 would become NaN.
y = df['qualityBand'].map({1: 0, 2:0, 3:1})
y = y.values
print ('started')

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20. On newer versions train_test_split lives in
# sklearn.model_selection, which also requires changing the import at the top
# of the file.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.20, random_state=5)

# # we create 40 separable points
# np.random.seed(0)
# X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Y = [0] * 20 + [1] * 20

# Linear-kernel SVM. probability=True is needed for predict_proba() later on,
# but it makes fit() slower because scikit-learn runs an internal 5-fold
# cross-validation to calibrate the probabilities.
# NOTE(review): SVC is not scale invariant. If fit() runs for a very long time
# on this data, standardising the features first (for example with
# sklearn.preprocessing.StandardScaler) is the usual remedy.
clf = svm.SVC(kernel='linear', probability = True)


print ('about to fit')
# Train on the training split only. Fitting on the full (X, y) would let the
# model see the rows that are later used as the test set, which inflates
# every reported metric.
clf.fit(X_train, y_train)

print ('fit')
print ('model fit')


# Draw the fitted linear decision boundary over the first two feature columns.
# Only w[0] and w[1] are used, so the picture is meaningful for 2-D X only.
# NOTE(review): a boundary with w[1] == 0 (a vertical line) is not handled.
w = clf.coef_[0]
# Slope of the line w[0]*x + w[1]*y + intercept = 0, solved for y.
a = -w[0] / w[1]
# Span the line across the observed range of the first feature instead of a
# fixed [-5, 5] interval, so it covers the data whatever its scale.
xx = np.linspace(X[:, 0].min(), X[:, 0].max())
yy = a * xx - (clf.intercept_[0]) / w[1]

# plot the parallels to the separating hyperplane that pass through the
# first and the last support vector
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')

# Support vectors are drawn as larger hollow markers; all samples are
# coloured by class.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)


# Tick labels for the confusion-matrix plots further down.
class_names = ['0','1']



# X[:, 0] is on the horizontal axis and X[:, 1] on the vertical axis, so the
# first component labels x and the second labels y.
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('Transformed Support Vector Machine {1: 0, 2:0, 3:1}')
plt.axis('tight')




plt.show()
# Score the classifier on X_test / y_test: class probabilities for the ROC
# curve, hard predictions for the report, confusion matrix and accuracy.
# Each prediction is computed once and the report is printed once.
probas = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)


print (classification_report(y_test, y_pred))


import scikitplot as skplt
import matplotlib.pyplot as plt

# NOTE(review): newer scikit-plot releases deprecate plot_roc_curve in favour
# of plot_roc — confirm against the installed version.
skplt.metrics.plot_roc_curve(y_test, probas)
# plt.show()


# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)

# Two decimals are enough when the normalised matrix is printed.
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()

plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='SVM Line: Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='SVM Line: Normalized confusion matrix')

# Shows every figure still open: the ROC curve and both confusion matrices.
plt.show()



print("Accuracy", metrics.accuracy_score(y_test, y_pred))


print ('finito')

python - clf.fit(X, y) 在数值很小的数据上无法运行完成 - fit() 永远不会结束 - 可能是 sklearn 的 bug：数据科学

0 回答 0

Related

Reference