我正在尝试使用烧瓶部署 XGBClassifier 模型。将值提供给网页上的相关字段后,它给了我以下错误:-
DataFrame.dtypes for data must be int, float, bool or categorical. When
categorical type is supplied, DMatrix parameter
`enable_categorical` must be set to `True`.JobType, EdType, maritalstatus,
occupation, relationship, gender
我使用了一个热编码器来编码 7 个分类变量,其余 3 个是数字的。以下是我的代码: -
train_x, test_x, train_y, test_y = train_test_split(data1, y, test_size = 0.2,
random_state=69)
# IMPUTING NAN VALUES
train_x['JobType'].fillna(train_x['JobType'].value_counts().index[0], inplace = True)
train_x['occupation'].fillna(train_x['occupation'].value_counts().index[0], inplace = True)
test_x['JobType'].fillna(train_x['JobType'].value_counts().index[0], inplace = True)
test_x['occupation'].fillna(train_x['occupation'].value_counts().index[0], inplace = True)
# SEPARATING CATEGORICAL VARIABLES
train_x_cat = train_x.select_dtypes(include = 'object')
train_x_num = train_x.select_dtypes(include = 'number')
test_x_cat = test_x.select_dtypes(include = 'object')
test_x_num = test_x.select_dtypes(include = 'number')
#ONE HOT ENCODING THE CATEGORICAL VARIABLES AND THEN CONCAT THEM TO NUMERICAL VARIABLES
ohe = OneHotEncoder(handle_unknown='ignore', sparse = False)
train_x_encoded = pd.DataFrame(ohe.fit_transform(train_x_cat))
train_x_encoded.columns = ohe.get_feature_names(train_x_cat.columns)
train_x_encoded = train_x_encoded.reset_index(drop = True)
train_x_num = train_x_num.reset_index(drop = True)
train_x1 = pd.concat([train_x_num, train_x_encoded], axis = 1)
test_x_encoded = pd.DataFrame(ohe.transform(test_x_cat))
test_x_encoded.columns = ohe.get_feature_names(test_x_cat.columns)
test_x_encoded = test_x_encoded.reset_index(drop = True)
test_x_num = test_x_num.reset_index(drop = True)
test_x1 = pd.concat([test_x_num, test_x_encoded], axis = 1)
#XGBC MODEL
model = XGBClassifier(random_state = 69)
#Hyperparameter tuning
def objective(trial):
learning_rate = trial.suggest_float('learning_rate', 0.001, 0.01)
n_estimators = trial.suggest_int('n_estimators', 10, 500)
sub_sample = trial.suggest_float('sub_sample', 0.0, 1.0)
max_depth = trial.suggest_int('max_depth', 1, 20)
params = {'max_depth' : max_depth,
'n_estimators' : n_estimators,
'sub_sample' : sub_sample,
'learning_rate' : learning_rate}
model.set_params(**params)
return np.mean(-1 * cross_val_score(model, train_x1, train_y,
cv = 5, n_jobs = -1, scoring = 'neg_mean_squared_error'))
xgbc_study = optuna.create_study(direction = 'minimize')
xgbc_study.optimize(objective, n_trials = 10)
xgbc_study.best_params
optuna_rfc_mse = xgbc_study.best_value
model.set_params(**xgbc_study.best_params)
model.fit(train_x1, train_y)
这是我的 Flask (app.py) 代码:-
@app.route('/', methods = ['GET', 'POST'])
def main():
if request.method == 'GET':
return render_template('index.html')
if request.method == "POST":
Age = request.form['age']
Jobtypes = request.form['JobType']
EducationType = request.form['EdType']
MaritalStatus = request.form['maritalstatus']
Occupation = request.form['occupation']
Relationship = request.form['relationship']
Gender = request.form['gender']
CapitalGain = request.form['capitalgain']
CapitalLoss = request.form['capitalloss']
HoursPerWeek = request.form['hrsperweek']
data = [[Age, Jobtypes, EducationType, MaritalStatus, Occupation, Relationship,
Gender, CapitalGain, CapitalLoss, HoursPerWeek]]
input_variables = pd.DataFrame(data, columns = ['age', 'JobType', 'EdType',
'maritalstatus', 'occupation',
'relationship', 'gender',
'capitalgain', 'capitalloss',
'hrsperweek'],
dtype = 'float', index = ['input'])
predictions = model.predict(input_variables)[0]
print(predictions)
return render_template('index.html', original_input = {'age':Age, 'JobType':Jobtypes,
'EdType':EducationType,
'maritalstatus':MaritalStatus,
'occupation':Occupation,
'relationship':Relationship,
'gender':Gender,
'capitalgain':CapitalGain,
'capitalloss':CapitalLoss,
'hrsperweek':HoursPerWeek},
result = predictions)
错误表明数据类型应该是 int、categorical、float 或 boolean。但是我已经对变量进行了编码,然后拟合了模型。那么为什么会出错呢?
提前致谢!