I am trying to create some learning curves with scikit-learn. Here is the code:
from sklearn import cross_validation
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.grid_search import GridSearchCV

# x, y and custom_forest (a forest regressor) are defined earlier in the notebook
X_traina, X_testa, y_traina, y_testa = cross_validation.train_test_split(x, y, test_size=0.95, random_state=23)
params = {'min_samples_split': [8], 'max_depth': [20], 'min_samples_leaf': [1], 'n_estimators': [200]}
cv = KFold(n=len(X_traina), n_folds=10, shuffle=True)
cv_stratified = StratifiedKFold(y_traina, n_folds=5)
gs = GridSearchCV(custom_forest, params, cv=cv_stratified, verbose=1, refit=True)
gs.fit(X_traina, y_traina)
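For reference, x, y and custom_forest are created earlier in my notebook; a rough stand-in (random data of the same shape and a plain RandomForestRegressor instead of my tuned forest) would look like this, in case anyone wants to reproduce the setup:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Stand-ins only: random data with the same shapes as my real data
# (1567 samples x 491 features, continuous target roughly in the 4.5-9.5
# range seen in the labels printed further down) and a default forest.
rng = np.random.RandomState(23)
x = rng.rand(1567, 491)
y = np.round(rng.uniform(4.5, 9.5, size=1567), 2)
custom_forest = RandomForestRegressor(random_state=23)

With these stand-ins the block above should exercise the same code path, although the exact numbers will of course differ from my real data.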
The error I get is:
ValueError: Found array with 0 sample(s) (shape=(0, 491)) while a minimum of 1 is required.
The full traceback is:
ValueError                                Traceback (most recent call last)
<ipython-input-41-e0bb24870a22> in <module>()
1 gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
----> 2 gs.fit(X_traina,y_traina)
/opt/conda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/opt/conda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/opt/conda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
/opt/conda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1604 score = scorer(estimator, X_test)
1605 else:
-> 1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
/opt/conda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in _passthrough_scorer(estimator, *args, **kwargs)
203 def _passthrough_scorer(estimator, *args, **kwargs):
204 """Function that wraps estimator.score"""
--> 205 return estimator.score(*args, **kwargs)
206
207
/opt/conda/lib/python2.7/site-packages/sklearn/base.pyc in score(self, X, y, sample_weight)
345
346 from .metrics import r2_score
--> 347 return r2_score(y, self.predict(X), sample_weight=sample_weight,
348 multioutput='variance_weighted')
349
/opt/conda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict(self, X)
647 """
648 # Check data
--> 649 X = self._validate_X_predict(X)
650
651 # Assign chunk of trees to jobs
/opt/conda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_X_predict(self, X)
317 "call `fit` before exploiting the model.")
318
--> 319 return self.estimators_[0]._validate_X_predict(X, check_input=True)
320
321 @property
/opt/conda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in _validate_X_predict(self, X, check_input)
363
364 if check_input:
--> 365 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
366 if issparse(X) and (X.indices.dtype != np.intc or
367 X.indptr.dtype != np.intc):
/opt/conda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
405 " minimum of %d is required%s."
406 % (n_samples, shape_repr, ensure_min_samples,
--> 407 context))
408
409 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 491)) while a minimum of 1 is required.
Now, I don't understand this, because when I print the shapes of the samples:
print (X_traina.shape, X_testa.shape, y_traina.shape, y_testa.shape)
I get:
((78, 491), (1489, 491), (78,), (1489,))
And if I print the cv_stratified object, it looks like this:
sklearn.cross_validation.StratifiedKFold(labels=[ 5.43 8.74 8.1 6.55 7.66 6.52 8.6 7.1 6.4 8.05 7.89 6.68
8.06 6.17 5.5 7.96 5.78 6. 7.74 5.83 6.51 6.31 6.68 9.22
6.07 7.06 7.12 8.64 5.72 6.4 7.64 5.74 7.41 6.49 6.81 7.1
7.66 6.68 7.05 6.28 5.49 6.35 6.9 6.2 7.51 5.65 9.3 5.84
6.92 5.75 6.92 8.8 7.04 5.81 5.73 5.31 7.13 7.66 6.98 5.93
8.24 6.96 8.22 7.27 7.34 5.91 5.57 6.5 7.28 6.74 4.92 6.88
5.8 9.15 6.63 6.37 8.66 6.4 ], n_folds=5, shuffle=False, random_state=None)
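As a quick diagnostic (my own sanity check, not part of the pipeline above), iterating over cv_stratified and printing the fold sizes should show whether any test fold comes out empty, which would line up with the "0 sample(s)" in the traceback:

# Print how many train/test samples each stratified fold actually gets.
for i, (train_idx, test_idx) in enumerate(cv_stratified):
    print("fold %d: %d train samples, %d test samples" % (i, len(train_idx), len(test_idx)))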
Interestingly, if I change the test_size parameter to 0.88 it works, and that is the highest value for which it still works. For that value the shapes are:
((188, 491), (1379, 491), (188,), (1379,))
It also works if I change StratifiedKFold to KFold.
So the question is: what should I change in my code so that it also works with test_size set to 0.95?
I am using scikit-learn version 0.17.1.