I am trying to figure out whether it is wise to drop highly correlated (and, optionally, negatively correlated) features before feature selection. Here is a snapshot of my code:
import numpy as np
import pandas as pd

def find_correlation(data, threshold=0.9, remove_negative=False):
    """Return a list of columns to drop so that no remaining pair of
    columns is correlated above `threshold`."""
    corr_mat = data.corr()
    if remove_negative:
        corr_mat = np.abs(corr_mat)
    # Keep only the strictly lower triangle so each pair is seen once
    corr_mat.loc[:, :] = np.tril(corr_mat, k=-1)
    already_in = set()
    result = []
    for col in corr_mat:
        perfect_corr = corr_mat[col][corr_mat[col] > threshold].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            result.append(perfect_corr)
    # Keep the first member of each correlated group; flag the rest for dropping
    select_nested = [f[1:] for f in result]
    select_flat = [i for j in select_nested for i in j]
    return select_flat
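# Illustrative toy check (hypothetical data, not part of my pipeline):
# on a frame with one near-duplicate column, find_correlation should
# flag one member of the correlated pair for dropping.
_rng = np.random.RandomState(0)
_a = _rng.normal(size=100)
_toy = pd.DataFrame({'a': _a,
                     'b': _a + _rng.normal(scale=0.01, size=100),  # ~perfectly correlated with 'a'
                     'c': _rng.normal(size=100)})
print(find_correlation(_toy, threshold=0.9))  # -> ['a']: 'a' dropped, its near-duplicate 'b' kept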
# Drop one side of each highly correlated group
corrFeatList = find_correlation(x)
fpd = x.drop(corrFeatList, axis=1)
# Attach the class labels and drop rows with missing labels
fpd['label'] = catlabel
fpd = fpd[fpd['label'].notnull()]
Features = np.array(fpd.iloc[:, :-1])
Labels = np.array(fpd.iloc[:, -1])
hpd = fpd.iloc[:, :-1]
headerName = hpd.columns
# Scale first: standardise features to zero mean and unit variance
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
Features = scaler.fit_transform(Features)
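# Side check (illustrative, hypothetical data): Pearson correlation is
# invariant to standardisation, so find_correlation sees the same
# correlation matrix whether it runs before or after this scaling step.
_df = pd.DataFrame(np.random.RandomState(1).normal(size=(50, 3)),
                   columns=['f1', 'f2', 'f3'])
_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(_df),
                       columns=_df.columns)
print(np.allclose(_df.corr(), _scaled.corr()))  # -> True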
# RFECV with logistic regression
import numpy.random as nr
from sklearn import model_selection as ms
from sklearn import linear_model
from sklearn import feature_selection as fs

## Reshape the label array to 1-D
Labels = Labels.reshape(Labels.shape[0],)
## Set folds for cross-validation
nr.seed(988)
feature_folds = ms.KFold(n_splits=10, shuffle=True)
## Define the model
logistic_mod = linear_model.LogisticRegression(C=10, class_weight="balanced")
## Perform feature selection by CV on the decorrelated features
nr.seed(6677)
selector = fs.RFECV(estimator=logistic_mod, cv=feature_folds)
selector = selector.fit(Features, Labels)
Features = selector.transform(Features)
print('Best features :', headerName[selector.support_])
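# Optional inspection of the fitted selector (standard RFECV attributes):
print('Optimal number of features:', selector.n_features_)
print('Feature ranking (1 = selected):', list(zip(headerName, selector.ranking_)))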
So I tried it both with and without dropping the correlated features, and RFECV selected entirely different feature sets. Do RFECV and other feature-selection (dimensionality-reduction) methods already account for such highly correlated features? Am I doing the right thing here? Finally, if dropping features above a high correlation threshold is a good idea, should I scale the data before doing so? Thank you.
Kevin