我的数据集有超过 10 万个观察值和 120 个特征。我想使用 XGBoost 和 Dask Gridsearchcv 来找到最佳参数。但是它抛出了一个错误。请让我知道我做错了什么
代码:
import dask_ml.model_selection as dcv
import dask.dataframe as dd
pipe1=Pipeline(steps=[('clf',XGBClassifier(objective= 'binary:logistic'))])
search_space=[{'clf__n_estimators': [100,150, 200, 300],
'clf__max_depth': [4,6,9],
'clf__learning_rate': [0.1, 0.01, 0.05],
'clf__random_state': [0],
'clf__subsample': [0.7,0.9,1],
'clf__colsamplebytree': [1.0, 0.9, 0.8]}]
grid_cpu=dcv.GridSearchCV(estimator=pipe1, param_grid=search_space, cv=5, scoring='roc_auc', return_train_score=True, n_jobs=-1,refit="AUC",scheduler='threads')
grid_cpu.fit(dd.from_pandas(X,npartitions=1), dd.from_pandas(y,npartitions=1))
print("Best parameters found: ",grid_mse.best_params_)
print("Best ROC found: ", np.sqrt(np.abs(grid_mse.best_score_)))
这里使用的 X 和 y 最初是一个 pandas 数据框。我试图在建模之前将其转换为 dask 数据框。我猜我在那里遗漏了一些东西。
错误:
AssertionError Traceback (most recent call last)
<ipython-input-186-5a2967c1ae38> in <module>
13 grid_cpu=dcv.GridSearchCV(estimator=pipe1, param_grid=search_space, cv=5, scoring='roc_auc', return_train_score=True, n_jobs=-1,
14 refit="AUC",scheduler='threads')
---> 15 grid_cpu.fit(dd.from_pandas(X,npartitions=1), dd.from_pandas(y,npartitions=1))
16 print("Best parameters found: ",grid_mse.best_params_)
17 print("Best ROC found: ", np.sqrt(np.abs(grid_mse.best_score_)))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask_ml\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
1231 error_score=error_score,
1232 return_train_score=self.return_train_score,
-> 1233 cache_cv=self.cache_cv,
1234 )
1235
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask_ml\model_selection\_search.py in build_cv_graph(estimator, cv, scorer, candidate_params, X, y, groups, fit_params, iid, error_score, return_train_score, cache_cv)
201
202 dsk = {}
--> 203 X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
204 n_splits = compute_n_splits(cv, X, y, groups)
205
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask_ml\model_selection\utils.py in to_keys(dsk, *args)
85 yield x.key
86 else:
---> 87 assert not is_dask_collection(x)
88 key = "array-" + tokenize(x)
89 dsk[key] = x
AssertionError: