When I look at the documentation, the argument should be a 'DMatrix' (xgboost version 1.5.0).
The docs for (almost exactly) the version I am actually running say the same; in the link below, go to subheading '1.2.2 Python':
https://xgboost.readthedocs.io/_/downloads/en/release_1.3.0/pdf/
I don't understand why a float argument is being requested when it should be a DMatrix.
I have looked through all the Stack Overflow posts containing the string 'TypeError: float() argument must be a string or a number, not ...', but none of them involve 'DMatrix', and I could not find a solution I could adapt to this particular problem.
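For context, xgboost's feval hook seems to have a different contract from sklearn's log_loss: judging by the 1.3 docs and the traceback further down, it is called as feval(preds, dmatrix) and should return a (metric_name, value) pair, which would explain why sklearn ends up trying to coerce a DMatrix to float. Here is a minimal sketch of a metric that fits that contract; the wrapper name and the softmax step are my own illustration, not something from the tutorial:

import numpy as np
from sklearn.metrics import log_loss

def logloss_feval(preds, dtrain):
    # xgboost hands a custom metric the raw predictions and the evaluation
    # DMatrix, so the true labels have to be pulled out of the DMatrix here.
    labels = dtrain.get_label()
    # With multi:softprob, custom metrics appear to receive margins
    # (output_margin=True per the traceback below), so convert them to
    # probabilities first (my assumption).
    exp = np.exp(preds - preds.max(axis=1, keepdims=True))
    proba = exp / exp.sum(axis=1, keepdims=True)
    # Pass labels explicitly in case a CV fold is missing a class.
    return 'logloss', log_loss(labels, proba, labels=np.arange(proba.shape[1]))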
Here is the code that raises this error (go to the line 'clf = xgb.train(...)'):
import json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss, r2_score
from sklearn.model_selection import KFold
# preprocessing(), get_grid_iterable() and timeout() are defined elsewhere in the notebook.

def grid_search(timeout_seconds, cv_splits, num_boost_round):
    # Read input data
    X, y = preprocessing()
    y.replace({1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7,
               9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14,
               16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21},
              inplace=True)

    # Create dataframe to collect the results
    tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"]
    test_id = 0
    tests = pd.DataFrame(columns=tests_columns)

    # Cross-validation number of splits
    kf = KFold(n_splits=cv_splits)

    # Execute until timeout occurs
    with timeout(timeout_seconds, exception=RuntimeError):
        # Get the grid
        grid_iter, keys, length = get_grid_iterable()
        try:
            # For every element of the grid
            for df_grid in grid_iter:
                # Prepare a list to collect the scores
                score = []
                params = dict(zip(keys, df_grid))
                # The objective function
                params["objective"] = "multi:softprob"
                params['num_class'] = 22
                print('X.reason_action_converted: ', X.reason_action_converted)

                # For each fold, train XGBoost and spit out the results
                for train_index, test_index in kf.split(X.values):
                    # Get X train and X test
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    # Get y train and y test
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                    # Convert into DMatrix
                    d_train = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
                    d_valid = xgb.DMatrix(X_test, label=y_test, missing=np.NaN)
                    d_test = xgb.DMatrix(X_test, missing=np.NaN)
                    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

                    # Create the classifier using the current grid params.
                    # Apply early stopping of 50 rounds.
                    '''clf = xgb.train(params, d_train, boosting_rounds, watchlist,
                                       early_stopping_rounds=50, feval=log_loss,
                                       maximize=True, verbose_eval=10)'''
                    clf = xgb.train(params, d_train, num_boost_round, watchlist,
                                    early_stopping_rounds=50, feval=log_loss,
                                    maximize=True, verbose_eval=10)
                    y_hat = clf.predict(d_test)

                    # Append scores on the fold kept out
                    score.append(r2_score(y_test, y_hat))

                # Store the result into a dataframe
                score_df = pd.DataFrame(columns=tests_columns, data=[
                    [test_id, np.mean(score), np.min(score), np.max(score),
                     np.median(score),
                     json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]])
                test_id += 1
                tests = pd.concat([tests, score_df])
        except RuntimeError:
            # When timeout occurs an exception is raised and the main cycle is broken
            pass

    # Spit out the results
    tests.to_csv("grid-search.csv", index=False)
    print(tests)


if __name__ == "__main__":
    grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)
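As an aside, the long y.replace(...) dict at the top of grid_search only shifts the class labels 1 through 22 down to 0 through 21; assuming y is a pandas Series holding exactly those integer codes, the same remap can be written as:

# Equivalent to the replace dict: shift labels 1..22 down to 0..21
# (assumes y contains exactly those integer codes).
y = y - 1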
The error message:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<command-3902447645915365> in <module>
    106 
    107 if __name__ == "__main__":
--> 108     grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)

<command-3902447645915365> in grid_search(timeout_seconds, cv_splits, num_boost_round)
     84     # Create the classifier using the current grid params. Apply early stopping of 50 rounds
     85     '''clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)'''
---> 86     clf = xgb.train(params, d_train, num_boost_round, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)
     87     y_hat = clf.predict(d_test)
     88 

/databricks/python/lib/python3.8/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
    204     Booster : a trained booster model
    205     """
--> 206     bst = _train_internal(params, dtrain,
    207                           num_boost_round=num_boost_round,
    208                           evals=evals,

/databricks/python/lib/python3.8/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
    107         nboost += 1
    108         # check evaluation result.
--> 109         if callbacks.after_iteration(bst, i, dtrain, evals):
    110             break
    111         # do checkpoint after evaluation, in case evaluation also updates

/databricks/python/lib/python3.8/site-packages/xgboost/callback.py in after_iteration(self, model, epoch, dtrain, evals)
    421         for _, name in evals:
    422             assert name.find('-') == -1, 'Dataset name should not contain `-`'
--> 423         score = model.eval_set(evals, epoch, self.metric)
    424         score = score.split()[1:]  # into datasets
    425         # split up `test-error:0.1234`

/databricks/python/lib/python3.8/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
   1350         if feval is not None:
   1351             for dmat, evname in evals:
-> 1352                 feval_ret = feval(self.predict(dmat, training=False,
   1353                                                output_margin=True), dmat)
   1354                 if isinstance(feval_ret, list):

/databricks/python/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

/databricks/python/lib/python3.8/site-packages/sklearn/metrics/_classification.py in log_loss(y_true, y_pred, eps, normalize, sample_weight, labels)
   2184     The logarithm used is the natural logarithm (base-e).
   2185     """
-> 2186     y_pred = check_array(y_pred, ensure_2d=False)
   2187     check_consistent_length(y_pred, y_true, sample_weight)
   2188 

/databricks/python/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

/databricks/python/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    636     # make sure we actually converted to numeric:
    637     if dtype_numeric and array.dtype.kind == "O":
--> 638         array = array.astype(np.float64)
    639     if not allow_nd and array.ndim >= 3:
    640         raise ValueError("Found array with dim %d. %s expected <= 2."

TypeError: float() argument must be a string or a number, not 'DMatrix'
I am using Databricks, Python 3.8.8, and xgboost 1.3.1.
I am trying to adapt the code from the tutorial Effortless Hyperparameters Tuning with Apache Spark.
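If all I need is to monitor the multiclass log loss during training, another option I am considering is xgboost's built-in metric requested through params instead of a custom feval; a sketch, under the assumption that the built-in 'mlogloss' metric is acceptable here:

# Sketch: use xgboost's built-in multiclass log loss instead of feval=log_loss.
# 'mlogloss' is a built-in eval_metric; maximize/feval are dropped because the
# built-in metric already knows it should be minimized.
params["eval_metric"] = "mlogloss"
clf = xgb.train(params, d_train, num_boost_round, watchlist,
                early_stopping_rounds=50, verbose_eval=10)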