You can define a custom scorer with the signature func(estimator, X, y), as suggested in this answer. In your case, the custom scorer definition would be:
import numpy as np

def r2_score_adj(estimator, X, y):
    y_pred = estimator.predict(X)
    if estimator.fit_intercept:
        # R-squared with mean-centered total sum of squares (model with intercept)
        rsquared = 1 - np.nansum((y - y_pred) ** 2) / np.nansum((y - np.nanmean(y)) ** 2)
        rsquared_adj = 1 - (X.shape[0] - 1) / (X.shape[0] - X.shape[1] - 1) * (1 - rsquared)
    else:
        # Uncentered R-squared (model without intercept), matching statsmodels' convention
        rsquared = 1 - np.nansum((y - y_pred) ** 2) / np.nansum(y ** 2)
        rsquared_adj = 1 - X.shape[0] / (X.shape[0] - X.shape[1]) * (1 - rsquared)
    return rsquared_adj
This is equivalent to statsmodels' adjusted R-squared definition:
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=10, noise=50, random_state=42)

# statsmodels with intercept
reg1 = sm.OLS(exog=sm.add_constant(X), endog=y).fit()
print(reg1.rsquared_adj)
# 0.9313017447410593

# scikit-learn with intercept
reg2 = LinearRegression(fit_intercept=True).fit(X, y)
print(r2_score_adj(reg2, X, y))
# 0.9313017447410593

# statsmodels without intercept
reg3 = sm.OLS(exog=X, endog=y).fit()
print(reg3.rsquared_adj)
# 0.9307276380801821

# scikit-learn without intercept
reg4 = LinearRegression(fit_intercept=False).fit(X, y)
print(r2_score_adj(reg4, X, y))
# 0.930727638080182
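Because the scorer follows scikit-learn's callable scorer convention of (estimator, X, y), it should also plug directly into scikit-learn's own cross-validation utilities. A minimal sketch, assuming the X and y generated above:

from sklearn.model_selection import cross_val_score

# Pass the custom scorer as the `scoring` callable; each fold reports the
# adjusted R-squared of the model fitted on that fold's training split and
# evaluated on its validation split.
scores = cross_val_score(LinearRegression(), X, y, scoring=r2_score_adj, cv=5)
print(scores.mean())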
You can then use the custom scorer in mlxtend's SequentialFeatureSelector as follows:
from mlxtend.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(
    estimator=LinearRegression(),
    k_features=2,
    forward=True,
    scoring=r2_score_adj,
    cv=2
)
sfs.fit(X, y)
print(sfs.subsets_)
# {1: {'feature_idx': (4,), 'cv_scores': array([0.11337299, 0.11996526]), 'avg_score': 0.1166691229065483, 'feature_names': ('4',)}, 2: {'feature_idx': (4, 9), 'cv_scores': array([0.20589701, 0.38117558]), 'avg_score': 0.2935362938943045, 'feature_names': ('4', '9')}}
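After fitting, mlxtend also exposes the winning subset directly; a short follow-up sketch, assuming the sfs object fitted above:

# Indices and average CV score of the selected k_features subset
print(sfs.k_feature_idx_)
print(sfs.k_score_)

# Reduce X to the selected columns for downstream modelling
X_selected = sfs.transform(X)
print(X_selected.shape)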