import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/kaglee/pokemon_alopez247.csv")
我对数据进行了预处理。
df.columns
Index(['dark', 'dragon', 'electric', 'fairy', 'fighting', 'fire', 'flying',
'ghost', 'grass', 'ground', 'ice', 'normal', 'poison', 'psychic',
'rock', 'steel', 'Ditto', 'bug', 'dragon', 'fairy', 'field', 'flying',
'grass', 'human-Like', 'mineral', 'monster', 'water_1', 'water_2',
'water_3', 'legend', 'Color_Blue', 'Color_Brown', 'Color_Green',
'Color_Grey', 'Color_Pink', 'Color_Purple', 'Color_Red', 'Color_White',
'Color_Yellow', 'body_bipedal_tailless', 'body_four_wings',
'body_head_arms', 'body_head_base', 'body_head_legs', 'body_head_only',
'body_insectoid', 'body_multiple_bodies', 'body_quadruped',
'body_serpentine_body', 'body_several_limbs', 'body_two_wings',
'body_with_fins'],
dtype='object')
df.shape
(721, 52)
X = df.drop('legend',axis=1)
y = df['legend']
X.shape,y.shape # ((721, 51), (721,))
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20,
stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y,test_size=0.20)
# Logistic model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_train_scaled
X_train_scaled.T[0].mean(), X_train_scaled.T[0].std()
model = LogisticRegression(random_state=1)
model.fit(X_train_scaled, y_train)
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_val_scaled)
accuracy_score(y_val, y_pred) # 0.9310344827586207
X_test_scaled = scaler.transform(X_test)
y_pred_test = model.predict(X_test_scaled)
accuracy_score(y_test, y_pred_test) # 0.9448275862068966
# RandomForest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
rfc = make_pipeline(RandomForestClassifier(criterion = 'entropy',n_jobs=-1, random_state=2,n_estimators=200))
rfc.fit(X_train, y_train)
print('test accuracy', rfc.score(X_test, y_test)) # 0.9379
# PDP , SHAP
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 80
from pdpbox.pdp import pdp_isolate, pdp_plot
import shap
import numpy as np
feature = 'bug'
isolated = pdp_isolate(
model=model,
dataset=X_train,
model_features=X_train.columns,
feature=feature
);
ValueError:意外的输入维度 59,预期为 51
ValueError:模型的特征数量必须与输入匹配。模型 n_features 为 51,输入 n_features 为 59
这两种类型的错误就出现了。
我之所以写,是因为即使搜索 Stack Overflow,我也无法理解我做错了什么。
feature = " " 无论我输入什么功能,错误都会出现。