我正在研究应用内核主成分分析 (KPCA) 来降低我的特征矩阵集的维数,从而获得一组数据点。我浏览了 scikit learn 包中 KPCA 中使用的参数,并了解到如果选择其中一个参数应该起作用(例如,如果选择了 gamma,则不使用度数和系数)。此外,我通过以下链接查看了用于分类模型的超参数方法:
我尝试编写 hyperopt 代码并将其与 KPCA 结合起来,但是,在处理 PCA 模型评分的领域中,我不断遇到错误。我知道 KPCA 没有分数才能找到 PCA 模型的准确性,那么,我该如何克服这个错误呢?我尝试了几种评分方法,但我从 inverse_fit 或数组大小中得到错误。请在下面找到代码和错误消息。
代码:
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, IncrementalPCA
from hyperopt import hp, tpe, atpe, fmin, Trials, rand, STATUS_OK
# Implementing Hyperparamater method:
models = {'pca' : PCA,
'kpca' : KernelPCA,
'spca' : SparsePCA,
# 'ipca' : IncrementalPCA
}
def search_space(model):
# Initialising variables:
model = model.lower()
space = {}
# Calling the models:
if model == 'pca':
space = {'svd_solver' : hp.choice('svd_solver', ["auto", "full", "arpack", "randomized"]),
}
elif model == 'kpca':
space = {'kernel' : hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed']),
'gamma' : hp.choice('gamma', np.arange(0.03, 0.05, 0.002)),
'degree' : hp.choice('degree', range(1, 10, 1)),
'coef0' : hp.choice('coef0', np.arange(1, 10, 0.2))
}
elif model == 'spca':
space = {'alpha' : hp.choice('alpha', np.arange(1.0, 15.0, 0.2)),
'ridge_alpha' : hp.choice('ridge_alpha', np.linspace(0.01, 0.3, 30)),
'method' : hp.choice('method', ['lars', 'cd']),
'max_iter' : hp.choice('max_iter', [1000, 1500, 2000, 2500, 3000])
}
# elif model == 'ipca':
# space = {'batch_size' : hp.choice('batch_size', ['gini', 'entropy']),
# }
space['model'] = model
return space
def obj_fnc(params):
model = params.get('model').lower()
# X_ = scale_normalize(params, X[:])
del params['model']
clf = models[model](**params)
return (get_acc_status(clf, X))
def get_acc_status(clf, X):
X_reduced = clf.fit_transform(X)
# X_prereduced = clf.fit_inverse_transform(X_reduced)
# acc = -1 * mean_squared_error(X, X_prereduced)
X_prereduced = clf.inverse_transform(X_reduced)
# acc = -1 * mean_absolute_error(X, X_prereduced)
acc = -1 * r2_score(X, X_prereduced)
# acc = cross_val_score(clf, X).mean()
return {'loss': -acc, 'status': STATUS_OK}
##### Hyperparameter optimisation:
# Running Bayesian Optimisation to get the best parameters:
start = time.time()
# Create the algorithms
tpe_algo = tpe.suggest
# rand_algo = rand.suggest
# atpe_algo = atpe.suggest
# Assigning model:
model = 'kpca'
# Creating the trial objects:
hypopt_trials = Trials()
# Getting the best parameters:
best_params = fmin(obj_fnc, search_space(model), algo=tpe_algo, max_evals=500, trials=hypopt_trials)
print("Best params: ", best_params)
print('Best accuracy: ', hypopt_trials.best_trial['result']['loss'])
print("[INFO] Baye. Opt. search took {:.2f} seconds".format(time.time() - start))
# Calling parameters:
## PCA:
svd_solver = ["auto", "full", "arpack", "randomized"]
## KPCA:
kernel = ["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"]
gamma = np.arange(0.1, 0.9, 0.01)
degree = range(1, 10, 1)
coef0 = np.arange(1, 10, 0.2)
kernel_gamma = ["poly", "rbf", "sigmoid"]
kernel_degree = "poly"
kernel_coef0 = "sigmoid"
## SPCA:
alpha = np.arange(1.0, 15.0, 0.2)
ridge_alpha = np.linspace(0.01, 0.3, 30)
method = ['lars', 'cd']
max_iter = [1000, 1500, 2000, 2500, 3000]
# Creating the PCA models:
# pca = PCA(n_components=2, svd_solver=svd_solver[best_params['svd_solver'])
if any(x in best_params for x in kernel_gamma):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]))
if any(x in best_params for x in kernel_degree):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), degree='{0}'.format(degree[best_params['degree']]), coef0='{0}'.format(coef0[best_params['coef0']]))
if any(x in best_params for x in kernel_coef0):
pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), coef0='{0}'.format(coef0[best_params['coef0']]))
# pca = SparsePCA(n_components=2, alpha='{0}'.format(alpha[best_params['alpha']]), ridge_alpha='{0}'.format(ridge_alpha[best_params['ridge_alpha']]), method=method[best_params['method']], max_iter='{0}'.format(max_iter[best_params['max_iter']]))
# pca = IncrementalPCA(n_components=2)
print('Model: ', pca)
PrincipalComponents = pca.fit_transform(X_std)
principalDf = pd.DataFrame(data = PrincipalComponents, columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, dataframe[['Label']]], axis = 1)
print('Principal Component Analysis: ')
print(principalDf)
错误信息:
错误信息 (1):
ValueError: There are significant negative eigenvalues (1.11715 of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.
错误信息 (2):
ValueError: Precomputed metric requires shape (n_queries, n_indexed). Got (50, 14) for 50 indexed.