我想在泰坦尼克号数据集中做出预测。
我想尝试 catboost 并遵循本指南:https ://www.analyticsvidhya.com/blog/2017/08/catboost-automated-categorical-data/ 但是当我尝试复制时它不起作用
我尝试按照指南进行操作,我认为 catboost 会处理所有数据转换,因为您可以在指南中看到他正在使用 dtypes object,float,int
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train = train.replace(np.nan, '', regex=True)
train.fillna(-999, inplace=True)
test.fillna(-999,inplace=True)
categorical_features_indices = np.where(X.dtypes != np.float)[0]
train = train.replace(np.nan, '', regex=True)
y= train.Survived
train_features=['Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X= train[train_features]
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)
#importing library and building model
from catboost import CatBoostRegressor
model=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)
我收到以下错误:
CatBoostError Traceback (most recent call last)
_catboost.pyx in _catboost.get_cat_factor_bytes_representation()
_catboost.pyx in _catboost.get_id_object_bytes_string_representation()
CatBoostError: bad object for id: 40.5
During handling of the above exception, another exception occurred:
CatBoostError Traceback (most recent call last)
<ipython-input-103-94f3a250223b> in <module>
2 from catboost import CatBoostRegressor
3 model=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')
----> 4 model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)
~\Anaconda3\lib\site-packages\catboost\core.py in fit(self, X, y, cat_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
3359 use_best_model, eval_set, verbose, logging_level, plot, column_description,
3360 verbose_eval, metric_period, silent, early_stopping_rounds,
-> 3361 save_snapshot, snapshot_file, snapshot_interval, init_model)
3362
3363 def predict(self, data, ntree_start=0, ntree_end=0, thread_count=-1, verbose=None):
~\Anaconda3\lib\site-packages\catboost\core.py in _fit(self, X, y, cat_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
1235 _check_train_params(params)
1236
-> 1237 train_pool = _build_train_pool(X, y, cat_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, column_description)
1238 if train_pool.is_empty_:
1239 raise CatBoostError("X is empty.")
~\Anaconda3\lib\site-packages\catboost\core.py in _build_train_pool(X, y, cat_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, column_description)
693 raise CatBoostError("y has not initialized in fit(): X is not catboost.Pool object, y must be not None in fit().")
694 train_pool = Pool(X, y, cat_features=cat_features, pairs=pairs, weight=sample_weight, group_id=group_id,
--> 695 group_weight=group_weight, subgroup_id=subgroup_id, pairs_weight=pairs_weight, baseline=baseline)
696 return train_pool
697
~\Anaconda3\lib\site-packages\catboost\core.py in __init__(self, data, label, cat_features, column_description, pairs, delimiter, has_header, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count)
322 )
323
--> 324 self._init(data, label, cat_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names)
325 super(Pool, self).__init__()
326
~\Anaconda3\lib\site-packages\catboost\core.py in _init(self, data, label, cat_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names)
674 baseline = np.reshape(baseline, (samples_count, -1))
675 self._check_baseline_shape(baseline, samples_count)
--> 676 self._init_pool(data, label, cat_features, pairs, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names)
677
678
_catboost.pyx in _catboost._PoolBase._init_pool()
_catboost.pyx in _catboost._PoolBase._init_pool()
_catboost.pyx in _catboost._PoolBase._init_features_order_layout_pool()
_catboost.pyx in _catboost._set_features_order_data_pd_data_frame()
_catboost.pyx in _catboost.get_cat_factor_bytes_representation()
CatBoostError: Invalid type for cat_feature[7,4]=40.5 : cat_features must be integer or string, real number values and NaN values should be converted to string.