有没有一种方法可以在 Sklearn 或其他任何库中一次对多个估计器进行网格搜索?例如,我们可以在一次网格搜索中同时传入 SVM 和随机森林吗?
问问题
14801 次
5 回答
32
是的。例子:
# One pipeline, several candidate classifiers: the 'clf' step is itself a
# grid parameter, so each dict below swaps in a different estimator together
# with the hyperparameters that belong to that estimator.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier()),  # placeholder; each grid supplies its own 'clf'
])

parameters = [
    {
        'vect__max_df': (0.5, 0.75, 1.0),
        'clf': (SGDClassifier(),),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        # `n_iter` is no longer an SGDClassifier parameter (removed in
        # scikit-learn 0.21); `max_iter` is its replacement.
        'clf__max_iter': (10, 50, 80),
    },
    {
        'vect__max_df': (0.5, 0.75, 1.0),
        'clf': (LinearSVC(),),
        'clf__C': (0.01, 0.5, 1.0),
    },
]

# A list of dicts is searched as the union of the individual grids.
grid_search = GridSearchCV(pipeline, parameters)
于 2016-10-20T13:10:58.940 回答
16
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
class DummyEstimator(BaseEstimator):
    """Placeholder estimator used only to construct a pipeline step.

    It carries no parameters and learns nothing. Both methods accept the
    usual ``X`` / ``y`` arguments (optional, so zero-argument calls still
    work) so that an accidental call does not fail with a ``TypeError``.
    """

    def fit(self, X=None, y=None):
        """Ignore the data and return ``self`` so calls can be chained."""
        return self

    def score(self, X=None, y=None):
        """Return ``None``; a placeholder has no meaningful score."""
        return None
# Create a pipeline
pipe = Pipeline([('clf', DummyEstimator())])  # Placeholder Estimator

# Candidate learning algorithms and their hyperparameters
search_space = [
    {
        # The solver is pinned because the grid includes 'l1': the current
        # default solver (lbfgs) rejects that penalty, while liblinear
        # accepts both 'l1' and 'l2'.
        'clf': [LogisticRegression(solver='liblinear')],  # Actual Estimator
        'clf__penalty': ['l1', 'l2'],
        'clf__C': np.logspace(0, 4, 10),
    },
    {
        'clf': [DecisionTreeClassifier()],  # Actual Estimator
        'clf__criterion': ['gini', 'entropy'],
    },
]

# Create grid search
gs = GridSearchCV(pipe, search_space)
于 2018-11-14T02:30:15.010 回答
12
我想你要找的是这个:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples; the split is seeded so runs are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The three lists below are parallel: entry i of each belongs together.
names = [
    "Naive Bayes",
    "Linear SVM",
    "Logistic Regression",
    "Random Forest",
    "Multilayer Perceptron",
]

classifiers = [
    MultinomialNB(),
    LinearSVC(),
    LogisticRegression(),
    RandomForestClassifier(),
    MLPClassifier(),
]

parameters = [
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': (1e-2, 1e-3),
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__C': (np.logspace(-5, 1, 5)),
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__C': (np.logspace(-5, 1, 5)),
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__max_depth': (1, 2),
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': (1e-2, 1e-3),
    },
]

# Run an independent grid search per classifier and report its score on the
# held-out split.
for name, classifier, params in zip(names, classifiers, parameters):
    steps = [
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', classifier),
    ]
    clf_pipe = Pipeline(steps)
    gs_clf = GridSearchCV(clf_pipe, param_grid=params, n_jobs=-1)
    clf = gs_clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("{} score: {}".format(name, score))
于 2018-03-01T15:16:31.093 回答
3
您可以使用 TransformedTargetRegressor。此类用于在拟合之前对目标变量进行变换,它接受一个回归器和一组变换器作为参数。但您也可以不提供变换器,此时会应用恒等变换(即不做任何变换)。由于回归器是该类的一个参数,我们可以通过网格搜索对象来更换它。
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
# Toy regression data: ten integer targets and one feature per sample.
Y = np.arange(1, 11)
# Add a trailing axis so X is a (10, 1) column rather than a flat vector.
X = np.array([0, 1, 3, 5, 3, 5, 7, 9, 8, 9])[:, np.newaxis]
为了进行网格搜索,我们应该将 param_grid 指定为一个 dict 列表,每个 dict 对应一个不同的估计器。这是因为不同的估计器使用不同的参数集(例如,为 MLPRegressor 设置 fit_intercept 会导致错误)。请注意,回归器会被自动命名为“regressor”。
# Wrapper whose 'regressor' parameter is what the grid search swaps out.
model = TransformedTargetRegressor()

# One grid per candidate regressor, so each only lists its own parameters.
linear_grid = {
    "regressor": [LinearRegression()],
    "regressor__fit_intercept": [True, False],
}
mlp_grid = {
    "regressor": [MLPRegressor()],
    "regressor__hidden_layer_sizes": [1, 5, 10],
}
params = [linear_grid, mlp_grid]
我们可以像往常一样进行拟合。
# Build the search over both grids and fit it in one expression.
g = GridSearchCV(model, params).fit(X, Y)
# Winning configuration: the estimator, its score, and the chosen parameters.
g.best_estimator_, g.best_score_, g.best_params_
# results in like
(TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
regressor=LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
normalize=False),
transformer=None),
-0.419213380219391,
{'regressor': LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
normalize=False), 'regressor__fit_intercept': False})
于 2019-01-10T08:05:06.373 回答
1
您可以做的是创建一个接受任何分类器的类,并为每个分类器提供任何参数设置。
创建一个适用于任何估算器的切换器类
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    The wrapped classifier is stored in ``estimator`` and every method simply
    delegates to it, so a parameter search can replace ``estimator`` itself
    and tune its parameters through the ``estimator__<param>`` names.
    """

    def __init__(self, estimator=None):
        """
        :param estimator: sklearn object - The classifier to delegate to.
            ``None`` (the default) creates a fresh ``SGDClassifier()``.
        """
        if estimator is None:
            # Build the default per instance: a default created in the
            # signature would be a single object shared by every
            # ClfSwitcher(). Importing here also removes any dependence on
            # where the module-level import sits relative to this class.
            from sklearn.linear_model import SGDClassifier
            estimator = SGDClassifier()
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return ``self``.

        Extra keyword arguments are forwarded to the wrapped estimator's
        ``fit`` instead of being silently dropped.
        """
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X``.

        ``y`` is accepted but not used.
        """
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's score on ``X`` and ``y``."""
        return self.estimator.score(X, y)
现在,您可以随意预训练 tfidf。
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the vectorizer and fit it on the corpus in one step.
tfidf = TfidfVectorizer().fit(data, labels)
现在使用这个预训练的 tfidf 创建一个管道
from sklearn.pipeline import Pipeline
# Two steps: TF-IDF features feeding the switchable classifier.
# NOTE(review): `tfidf` was fitted earlier, but a grid search presumably
# clones the pipeline and re-fits this step on each training fold -- confirm
# before relying on the earlier fit.
steps = [
    ('tfidf', tfidf),
    ('clf', ClfSwitcher()),
]
pipeline = Pipeline(steps)
执行超参数优化
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
# One grid per candidate classifier. 'clf__estimator' replaces the object
# wrapped by ClfSwitcher; 'clf__estimator__<param>' tunes that object.
parameters = [
    {
        # SVM if hinge loss / logreg if log loss
        'clf__estimator': [SGDClassifier()],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        # 'log_loss' is the current name of the logistic loss; the older
        # spelling 'log' was deprecated in scikit-learn 1.1 and later removed.
        'clf__estimator__loss': ['hinge', 'log_loss', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, verbose=3)

# param optimization
gscv.fit(train_data, train_labels)
如何解释 clf__estimator__loss
clf__estimator__loss 被解释为 estimator 的 loss 参数,无论 estimator 具体是什么;在最上面的例子中 estimator = SGDClassifier(),而 estimator 本身又是 clf 的一个参数,clf 则是一个 ClfSwitcher 对象。
于 2018-12-25T23:02:21.197 回答