0

我正在尝试使用 scikit-learn 创建一些学习曲线,这是代码:

# NOTE: these are the pre-0.18 module paths (sklearn.cross_validation and
# sklearn.grid_search), matching the scikit-learn 0.17.1 install in use.
from sklearn import cross_validation
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.grid_search import GridSearchCV

# Keep only 5% of the rows for the grid search (test_size=0.95).
# `x` and `y` are defined elsewhere.
X_traina, X_testa, y_traina, y_testa = cross_validation.train_test_split(x, y, test_size=0.95, random_state=23)

# Single-point grid: every hyper-parameter has exactly one candidate value.
params = {'min_samples_split': [8], 'max_depth': [20], 'min_samples_leaf': [1],'n_estimators':[200]}
# Plain shuffled 10-fold splitter; defined here but NOT passed to GridSearchCV below.
cv = KFold(n=len(X_traina),n_folds=10,shuffle=True)
# NOTE(review): StratifiedKFold stratifies on the distinct values of its labels.
# y_traina holds continuous regression targets (5.43, 8.74, ...), so almost
# every value is its own "class" with fewer members than n_folds. With only
# 78 samples this most likely leaves some test folds empty, which matches
# "Found array with 0 sample(s)" raised while scoring. Stratification is meant
# for classification labels; for a regressor pass the plain KFold (`cv`) instead.
cv_stratified = StratifiedKFold(y_traina, n_folds=5)
# `custom_forest` is defined elsewhere; refit=True retrains the best model on all of X_traina.
gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
gs.fit(X_traina,y_traina)

我得到的错误是:

ValueError: Found array with 0 sample(s) (shape=(0, 491)) while a minimum of 1 is required.

完整的回溯(traceback)是:

ValueError                                Traceback (most recent call last)
<ipython-input-41-e0bb24870a22> in <module>()
      1 gs = GridSearchCV(custom_forest, params, cv=cv_stratified,verbose=1,refit=True)
----> 2 gs.fit(X_traina,y_traina)

/opt/conda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
    805 
    806 

/opt/conda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
    554                 for train, test in cv)
    555 

/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    798             # was dispatched. In particular this covers the edge
    799             # case of Parallel used with an exhausted iterator.
--> 800             while self.dispatch_one_batch(iterator):
    801                 self._iterating = True
    802             else:

/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    656                 return False
    657             else:
--> 658                 self._dispatch(tasks)
    659                 return True
    660 

/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    564 
    565         if self._pool is None:
--> 566             job = ImmediateComputeBatch(batch)
    567             self._jobs.append(job)
    568             self.n_dispatched_batches += 1

/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    178         # Don't delay the application, to avoid keeping the input
    179         # arguments in memory
--> 180         self.results = batch()
    181 
    182     def get(self):

/opt/conda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

 /opt/conda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
   1548 
   1549     else:
-> 1550         test_score = _score(estimator, X_test, y_test, scorer)
   1551         if return_train_score:
   1552             train_score = _score(estimator, X_train, y_train, scorer)

/opt/conda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
   1604         score = scorer(estimator, X_test)
   1605     else:
-> 1606         score = scorer(estimator, X_test, y_test)
   1607     if not isinstance(score, numbers.Number):
   1608         raise ValueError("scoring must return a number, got %s (%s) instead."

/opt/conda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in _passthrough_scorer(estimator, *args, **kwargs)
    203 def _passthrough_scorer(estimator, *args, **kwargs):
    204     """Function that wraps estimator.score"""
--> 205     return estimator.score(*args, **kwargs)
    206 
    207 

/opt/conda/lib/python2.7/site-packages/sklearn/base.pyc in score(self, X, y, sample_weight)
    345 
    346         from .metrics import r2_score
--> 347         return r2_score(y, self.predict(X), sample_weight=sample_weight,
    348                         multioutput='variance_weighted')
    349 

/opt/conda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in predict(self, X)
    647         """
    648         # Check data
--> 649         X = self._validate_X_predict(X)
    650 
    651         # Assign chunk of trees to jobs

/opt/conda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_X_predict(self, X)
    317                                  "call `fit` before exploiting the model.")
    318 
--> 319         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    320 
    321     @property

/opt/conda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in _validate_X_predict(self, X, check_input)
    363 
    364         if check_input:
--> 365             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    366             if issparse(X) and (X.indices.dtype != np.intc or
    367                                 X.indptr.dtype != np.intc):

/opt/conda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              " minimum of %d is required%s."
    406                              % (n_samples, shape_repr, ensure_min_samples,
--> 407                                 context))
    408 
    409     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0, 491)) while a minimum of 1 is required.

现在我不明白这一点,因为当我打印样本的形状时:

# Collect the shapes of the four train/test splits and show them as one tuple.
split_shapes = (X_traina.shape, X_testa.shape, y_traina.shape, y_testa.shape)
print (split_shapes)

我得到:

((78, 491), (1489, 491), (78,), (1489,))

如果我打印 cv_stratified,它看起来是这样的对象:

sklearn.cross_validation.StratifiedKFold(labels=[ 5.43  8.74  8.1   6.55  7.66  6.52  8.6   7.1   6.4   8.05  7.89  6.68
  8.06  6.17  5.5   7.96  5.78  6.    7.74  5.83  6.51  6.31  6.68  9.22
  6.07  7.06  7.12  8.64  5.72  6.4   7.64  5.74  7.41  6.49  6.81  7.1
  7.66  6.68  7.05  6.28  5.49  6.35  6.9   6.2   7.51  5.65  9.3   5.84
  6.92  5.75  6.92  8.8   7.04  5.81  5.73  5.31  7.13  7.66  6.98  5.93
  8.24  6.96  8.22  7.27  7.34  5.91  5.57  6.5   7.28  6.74  4.92  6.88
  5.8   9.15  6.63  6.37  8.66  6.4 ], n_folds=5, shuffle=False, random_state=None)

有趣的是,如果我将test_size参数更改为0.88它可以工作,这是它工作的最高值。对于此值,形状为:

((188, 491), (1379, 491), (188,), (1379,))

如果我把 StratifiedKFold 改为 KFold,它也可以工作。

所以问题是:我应该对代码做哪些更改,才能使它在 test_size 设置为 0.95 时也能正常工作?

我正在使用 scikit 0.17.1 版。

4

0 回答 0