
I want to make predictions with a regression model.

I'm trying to tune my LightGBM model for the best hyperparameters, targeting the lowest generalization RMSE score without overfitting or underfitting.

All the examples I've seen use classification, split the data randomly with no awareness of time-series ordering, and rely on GridSearch; none of that fits my problem.

How can I run Bayesian hyperparameter optimization for my final model while using nested CV with TimeSeriesSplit?
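To be clear about the split behavior I need: every fold must train on the past and validate on the future, with no shuffling. A minimal illustration of what sklearn's TimeSeriesSplit does (expanding training window):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_demo = np.arange(10).reshape(-1, 1)
for train_idx, val_idx in TimeSeriesSplit(n_splits=3).split(X_demo):
    print("train:", train_idx, "val:", val_idx)
# train: [0 1 2 3] val: [4 5]
# train: [0 1 2 3 4 5] val: [6 7]
# train: [0 1 2 3 4 5 6 7] val: [8 9]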

My simple CV code so far:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, Trials, space_eval
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

... import data via pandas ...

y = df["target"] # predictor y
features = df.drop("target", axis=1).columns
X = df[traffic_features] # features X

days = len(df) - 60 # 2 months for test data / ~20%
X_train, X_test = X[:days], X[days:]
y_train, y_test = y[:days], y[days:]

# hyperopt

random_state = 42

# TimeSeriesSplit must be defined before it is used as a default argument in lightgbm_cv below
cvTSS = TimeSeriesSplit(max_train_size=None, n_splits=10)

def lightgbm_cv(params, random_state=random_state, cv=cvTSS, X=X_train, y=y_train):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']), 
        'learning_rate': params['learning_rate'],
        'min_child_weight': params['min_child_weight'],
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'num_leaves': int(params['num_leaves']),
        'max_bin': int(params['max_bin']),
        # 'num_iterations' is a LightGBM alias for 'n_estimators', so it must not be set twice
        'objective': 'rmse',
        }
    
    # use these params to create a new LGBM regressor
    model = lgb.LGBMRegressor(random_state=random_state, **params)
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1).mean()
    print(score)

    return score

space={
    'n_estimators': hp.quniform('n_estimators', 100, 10_000, 1),
    'max_depth' : hp.quniform('max_depth', 2, 100, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 2),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'feature_fraction': hp.quniform('feature_fraction', 0.1, 1, 0.1),
    'bagging_fraction': hp.quniform('bagging_fraction', 0.1, 1, 0.1),
    'bagging_freq': hp.quniform('bagging_freq', 1, 1_000, 1),
    "num_leaves": hp.quniform('num_leaves', 10, 1_000, 1),
    "max_bin": hp.quniform('max_bin', 10, 2_000, 1),
    "num_iterations": hp.quniform('num_iterations', 100, 10_000, 1),
    'objective': 'rmse',
    #'verbose': 0,
      }

# trials will contain logging information
trials = Trials()

n_iter = 100

best=fmin(fn=lightgbm_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm; hyperopt will select its parameters automatically
          max_evals=n_iter, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.RandomState(random_state) # fix random state for reproducibility (newer hyperopt versions expect np.random.default_rng instead)
         )

# computing the score on the test set - some parameters from "space" are missing here, not important atm
# fmin returns indices for hp.choice parameters, so map them back to actual values first
best_params = space_eval(space, best)
model = lgb.LGBMRegressor(random_state=random_state, n_estimators=int(best_params['n_estimators']),
                      max_depth=int(best_params['max_depth']), learning_rate=best_params['learning_rate'])
model.fit(X_train, y_train)
tpe_test_score = mean_squared_error(y_test, model.predict(X_test), squared=False)

print("Best RMSE {:.3f} params {}".format(lightgbm_cv(best_params), best_params))
