当将 GridSearchCV 与 LogisticRegression 一起用于更大的数据集时,我们能够(目前仅在我们的应用程序的上下文中)在 Ubuntu 15.04 和 OS X 上使用 scikit-learn 0.17 重现以下问题。
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('cpencoder', <cpml.whitebox.Lin...s', refit=True, scoring=u'roc_auc', verbose=1))]), X= Unnamed: 0 member_id loan_a... 42.993346
[152536 rows x 45 columns], y=array([0, 1, 0, ..., 1, 1, 0]), **fit_params={})
160 y : iterable, default=None
161 Training targets. Must fulfill label requirements for all steps of
162 the pipeline.
163 """
164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
--> 165 self.steps[-1][-1].fit(Xt, y, **fit_params)
self.steps.fit = undefined
Xt = array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]])
y = array([0, 1, 0, ..., 1, 1, 0])
fit_params = {}
166 return self
167
168 def fit_transform(self, X, y=None, **fit_params):
169 """Fit all the transforms one after the other and transform the
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring=u'roc_auc', verbose=1), X=array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=array([0, 1, 0, ..., 1, 1, 0]))
799 y : array-like, shape = [n_samples] or [n_samples, n_output], optional
800 Target relative to X for classification or regression;
801 None for unsupervised learning.
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
self._fit = <bound method GridSearchCV._fit of GridSearchCV(...obs', refit=True, scoring=u'roc_auc', verbose=1)>
X = array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]])
y = array([0, 1, 0, ..., 1, 1, 0])
self.param_grid = {'C': [1], 'class_weight': ['auto'], 'fit_intercept': [False], 'intercept_scaling': [1], 'penalty': ['l2']}
805
806
807 class RandomizedSearchCV(BaseSearchCV):
808 """Randomized search on hyper parameters.
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring=u'roc_auc', verbose=1), X=array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=array([0, 1, 0, ..., 1, 1, 0]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
548 )(
549 delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
550 train, test, self.verbose, parameters,
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
parameters = undefined
parameter_iterable = <sklearn.grid_search.ParameterGrid object>
554 for train, test in cv)
555
556 # Out is a list of triplet: score, estimator, n_test_samples
557 n_fits = len(out)
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=2), iterable=<generator object <genexpr>>)
807 if pre_dispatch == "all" or n_jobs == 1:
808 # The iterable was consumed all at once by the above for loop.
809 # No need to wait for async callbacks to trigger to
810 # consumption.
811 self._iterating = False
--> 812 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=2)>
813 # Make sure that we get a last message telling us we are done
814 elapsed_time = time.time() - self._start_time
815 self._print('Done %3i out of %3i | elapsed: %s finished',
816 (len(self._output), len(self._output),
---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError Mon Jan 18 11:58:09 2016
PID: 71840 Python 2.7.10: /Users/samuelhopkins/.virtualenvs/cpml/bin/python
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
67 def __init__(self, iterator_slice):
68 self.items = list(iterator_slice)
69 self._size = len(self.items)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
75 return self._size
76
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=LogisticRegression(C=1, class_weight='auto', dua... tol=0.0001, verbose=0, warm_start=False), X=memmap([[ 0.00000000e+00, 1.29659900e+06, 5...000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=memmap([0, 1, 0, ..., 1, 1, 0]), scorer=make_scorer(roc_auc_score, needs_threshold=True), train=array([ 49100, 49101, 49102, ..., 152533, 152534, 152535]), test=array([ 0, 1, 2, ..., 57517, 57522, 57532]), verbose=1, parameters={'C': 1, 'class_weight': 'auto', 'fit_intercept': False, 'intercept_scaling': 1, 'penalty': 'l2'}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
1545 " numeric value. (Hint: if using 'raise', please"
1546 " make sure that it has been spelled correctly.)"
1547 )
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
1553
1554 scoring_time = time.time() - start_time
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator=LogisticRegression(C=1, class_weight='auto', dua... tol=0.0001, verbose=0, warm_start=False), X_test=memmap([[ 0.00000000e+00, 1.29659900e+06, 5...000000e+01, 0.00000000e+00, 4.29933458e+01]]), y_test=memmap([0, 1, 0, ..., 1, 1, 1]), scorer=make_scorer(roc_auc_score, needs_threshold=True))
1604 score = scorer(estimator, X_test)
1605 else:
1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
-> 1609 % (str(score), type(score)))
1610 return score
1611
1612
1613 def _permutation_test_score(estimator, X, y, cv, scorer):
ValueError: scoring must return a number, got 0.998981811748 (<class 'numpy.core.memmap.memmap'>) instead.
我们已经多次尝试在应用程序的上下文之外重现它,但都没有成功。我们对 cross_validation.py 进行了以下更改,并解决了我们的特定问题:
...
if isinstance(score, np.core.memmap):
score = np.float(score)
if not isinstance(score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s) instead."
...
更多信息:
- 我们使用的是 Python 2.7
- 我们使用 Pipeline 来确保所有输入都是数字
我的问题如下:
- 我们应该如何重现这个问题,即让 scorer 返回一个 memmap?
- 还有其他人遇到过这个问题吗?
- 我们对 cross_validation.py 所做的更改实际上是一个合理的解决方案吗?