Here is a general example for you to consider.
import pandas as pd
import numpy as np
# Read dataset
df = pd.read_csv('balance-scale.data',
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])
# Display example observations
df.head()
df['balance'].value_counts()
# R 288
# L 288
# B 49
# Name: balance, dtype: int64
# Transform into binary classification
df['balance'] = [1 if b=='B' else 0 for b in df.balance]
df['balance'].value_counts()
# 0 576
# 1 49
# Name: balance, dtype: int64
# About 8% were balanced
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Next, we'll fit a very simple model using default settings for everything.
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
# Train model
clf_0 = LogisticRegression().fit(X, y)
# Predict on training set
pred_y_0 = clf_0.predict(X)
# How's the accuracy?
print( accuracy_score(y, pred_y_0) )
# 0.9216
# So our model has 92% overall accuracy, but is it because it's predicting only 1 class?
# Should we be excited?
print( np.unique( pred_y_0 ) )
# [0]
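As a quick sanity check (a sketch of my own, not part of the original walkthrough): the 92% accuracy is exactly the majority-class share of the data, i.e. the score you'd get by always predicting 0.
# Sanity-check sketch: the "null accuracy" of always predicting
# the majority class reproduces the 92% figure above
print( (y == 0).mean() )
# 0.9216  (576 / 625)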
# At this point, we need to use RESAMPLING!
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]
# Upsample the minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=576,    # to match majority class
                                 random_state=123) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# Display new class counts
df_upsampled.balance.value_counts()
# 1 576
# 0 576
# Name: balance, dtype: int64
# Separate input features (X) and target variable (y)
y = df_upsampled.balance
X = df_upsampled.drop('balance', axis=1)
# Train model
clf_1 = LogisticRegression().fit(X, y)
# Predict on training set
pred_y_1 = clf_1.predict(X)
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )
# [0 1]
# How's our accuracy?
print( accuracy_score(y, pred_y_1) )
# 0.513888888889
# Great, now the model is no longer predicting just one class. While the accuracy
# took a nosedive, it's now more meaningful as a performance metric.
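To see where that number comes from per class, here's a minimal sketch using standard sklearn.metrics calls (this inspection step is my addition, not part of the original example):
# Illustrative sketch: per-class behaviour of the upsampled model
from sklearn.metrics import confusion_matrix, classification_report
print( confusion_matrix(y, pred_y_1) )       # rows = true class, columns = predicted class
print( classification_report(y, pred_y_1) )  # precision / recall / F1 per class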
# Next, we'll downsample the majority class.
# Separate majority and minority classes
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]
# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=49,     # to match minority class
                                   random_state=123) # reproducible results
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
# Display new class counts
df_downsampled.balance.value_counts()
# 1 49
# 0 49
# Name: balance, dtype: int64
# Separate input features (X) and target variable (y)
y = df_downsampled.balance
X = df_downsampled.drop('balance', axis=1)
# Train model
clf_2 = LogisticRegression().fit(X, y)
# Predict on training set
pred_y_2 = clf_2.predict(X)
# Is our model still predicting just one class?
print( np.unique( pred_y_2 ) )
# [0 1]
# How's our accuracy?
print( accuracy_score(y, pred_y_2) )
# 0.581632653061
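Accuracy on such a small balanced sample is noisy, so it can help to look at AUROC as well. Here's a minimal sketch, mirroring the AUROC computation used for the random forest below (this step is my addition; roc_auc_score is a standard sklearn call):
# Illustrative sketch: AUROC for the downsampled model
from sklearn.metrics import roc_auc_score
prob_y_2 = clf_2.predict_proba(X)[:, 1]  # probability of the positive class
print( roc_auc_score(y, prob_y_2) )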
Always remember that the random forest algorithm handles imbalanced datasets well, so maybe that's all you need! I usually start every experiment with a random forest; if it produces the results I want, I'm done. There's no need to hunt down the single best algorithm in the universe. You can also easily automate testing dozens of algorithms on any given dataset (see the sketch after the code below).
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
# Train model
from sklearn.ensemble import RandomForestClassifier
clf_4 = RandomForestClassifier()
clf_4.fit(X, y)
# Predict on training set
pred_y_4 = clf_4.predict(X)
# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )
# [0 1]
# How's our accuracy?
print( accuracy_score(y, pred_y_4) )
# 0.9744
# What about AUROC?
from sklearn.metrics import roc_auc_score
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y, prob_y_4) )
# 0.999078798186
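To make the "automate testing dozens of algorithms" point concrete, here's a minimal sketch of such a loop; the model list and the cross-validated AUROC scoring are my own illustrative choices, not something the original example prescribes:
# Illustrative sketch: score several models with 5-fold cross-validated AUROC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

models = {
    'logistic_regression': LogisticRegression(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier(),
}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print(name, scores.mean())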
Reference:
https://elitedatascience.com/imbalanced-classes