python - 使用随机森林应用分层 10 折交叉验证

Question

我是 machine learning 新手。我的 dataset 尚未标准化，但我会在处理过程中使用 StandardScaler。我的数据有多个类别（1、2、...、10 类）。

我想知道如何应用 10 折交叉验证而不是 train_test_split。

#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; on newer
# versions this line must become a plain `import joblib`.
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset; read_csv's default treats the first CSV row as the header,
# which is what the 'DJ class' column lookup below relies on.
dataset = pd.read_csv('finalDataset.csv')

# Distinct class names in order of first appearance. They are only used to
# label the crosstab further down, not to train the model.
factor = pd.factorize(dataset['DJ class'])
definitions = factor[1]

# Independent features (columns 3..1940) and dependent variable (column 0).
X = dataset.iloc[:, 3:1941].values
y = dataset.iloc[:, 0].values

# Stratified 10-fold cross-validation on the full data set. The scaler is part
# of the pipeline so it is re-fitted on the training part of every fold and
# never sees the held-out fold.
cv_model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=40),
)
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=30)
cv_scores = cross_val_score(cv_model, X, y, cv=cv_splitter, scoring='accuracy')
print('10-fold CV accuracy: %.6f +/- %.6f' % (cv_scores.mean(), cv_scores.std()))

# Hold-out split (10% test) used for the detailed report and the saved model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)

# Feature scaling: fit on the training part only, then apply to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the random forest on the training set.
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=40)
classifier.fit(X_train, y_train)

# Predict once and reuse the result for every metric below.
y_pred = classifier.predict(X_test)

# Map class codes to class names for display only. The encoded y_test / y_pred
# are left untouched so that all metrics compare labels from the same space.
# NOTE(review): this assumes y holds the codes 1..10 and that code k belongs to
# the k-th entry of `definitions` - confirm against the data.
reversefactor = dict(zip(range(1, 11), definitions))
y_test_named = np.vectorize(reversefactor.get)(y_test)
y_pred_named = np.vectorize(reversefactor.get)(y_pred)

# Confusion matrix with readable class names.
print(pd.crosstab(y_test_named, y_pred_named, rownames=['Actual DJ'], colnames=['Predicted DJ']))

# Metrics on the encoded labels (true and predicted share one label space).
sk_report = classification_report(y_true=y_test, y_pred=y_pred, digits=6)
print(sk_report)

accuracy = accuracy_score(y_test, y_pred)
print('accuracy_score', accuracy)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Save the model to disk when hold-out accuracy exceeds 75%.
# NOTE: only the classifier is persisted, not `scaler`; anything loading this
# file must scale its inputs the same way before calling predict.
modelFilename = 'randomforestmodel.pkl'
if accuracy * 100 > 75:
    joblib.dump(classifier, modelFilename)
    print("Saved model to disk")

谁能帮我？谢谢

python - 使用随机森林应用分层 10 折交叉验证

0 回答 0

Related

Reference