我正在研究一种光谱聚类算法的变体,该算法多次对更新的相似性矩阵执行标准光谱嵌入W
,n_iters
因此我必须检查在特征分解之前L_rw
通过更新W
每次迭代生成的归一化拉普拉斯矩阵的 PSD。问题是,即使初始化L_rw
在理论上总是 PSD,arpack
有时也会产生负特征值(有时并非总是如此)。总之,arpack
并不总是保证半定对称矩阵的非负特征值。下面是示例代码来说明我遇到的问题。
import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse.linalg import arpack
def my_laplacian(W):
""" Generate normed laplacian matrix L_rw for similarity matrix W
(L_rw is according to the definition in `A Tutorial on Spectral
Clustering` by Von Luxburg, 2007)
"""
D = gen_D(W)
if not np.all(D != 0):
raise ValueError('W exists all-zero row')
L_rw = normalize_L(W, D)
if not isPSD(L_rw):
raise ValueError('L_rw isn\'t PSD')
return L_rw
def gen_D(W):
if sp.issparse(W):
return np.asarray(W.sum(axis = 0)).squeeze()
else:
return W.sum(axis = 0)
def normalize_L(W, D):
""" Normalize laplacian L with formula L_rw = I - D ^ (-1) * W
"""
tmp = np.reciprocal(D)
if sp.issparse(W):
return sp.eye(W.shape[0], format = 'csr') - safe_sparse_dot(sp.diags(tmp, 0).tocsr(), W)
else:
return np.fill_diagonal(-W, 1 - np.einsum('i,i->i', tmp, W.diagonal()))
def isPSD(L, tol = 1e-10):
""" Check wheter L is positive semi-definite
"""
vals_large, vecs_large = arpack.eigsh(L, k = 10, which = 'LM', maxiter = 6000)
vals_small, vecs_small = arpack.eigsh(L, k = 10, sigma = 0, which = 'LM', maxiter = 6000)
return np.all(vals_small > - tol) and np.all(vals_large > -tol)
当我运行它时:
In [2]: X = np.random.random_integers(0, 500, size = (500, 1200))
In [3]: from sklearn.metrics.pairwise import *
In [4]: additive_chi2_kernel(X)
Out[4]:
array([[ -0. , -108450.9702963 , -110301.71485863, ...,
-114578.16996935, -106849.35599894, -102370.37161802],
[-108450.9702963 , -0. , -109129.14648767, ...,
-112274.78420127, -107293.6284921 , -103749.55524937],
[-110301.71485863, -109129.14648767, -0. , ...,
-109282.25057724, -99851.02992075, -103023.45281752],
...,
[-114578.16996935, -112274.78420127, -109282.25057724, ...,
-0. , -110133.11568012, -111218.58079982],
[-106849.35599894, -107293.6284921 , -99851.02992075, ...,
-110133.11568012, -0. , -102516.65009177],
[-102370.37161802, -103749.55524937, -103023.45281752, ...,
-111218.58079982, -102516.65009177, -0. ]])
In [5]: W = np.exp(-_ * (1. / X.shape[1]))
In [6]: W = sp.coo_matrix(W).tocsr()
In [7]: my_laplacian(W)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-9b100976a310> in <module>()
----> 1 my_laplacian(W)
<ipython-input-1-dbf198146761> in my_laplacian(W)
16 L_rw = normalize_L(W, D)
17 if not isPSD(L_rw):
---> 18 raise ValueError('L_rw isn\'t PSD')
19 return L_rw
ValueError: L_rw isn't PSD
然后我pdb.run('L_rw = my_laplacian(W)')
,但它没有ValueError
(Pdb) n
> <ipython-input-1-dbf198146761>(17)my_laplacian()
-> if not isPSD(L_rw):
(Pdb) L_rw
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
(Pdb) n
> <ipython-input-1-dbf198146761>(19)my_laplacian()
-> return L_rw
我又跑了一遍:
In [9]: my_laplacian(W)
Out[9]:
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
In [10]: my_laplacian(W)
Out[10]:
<500x500 sparse matrix of type '<type 'numpy.float64'>'
with 250000 stored elements in Compressed Sparse Row format>
In [11]: my_laplacian(W)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-9b100976a310> in <module>()
----> 1 my_laplacian(W)
<ipython-input-1-dbf198146761> in my_laplacian(W)
16 L_rw = normalize_L(W, D)
17 if not isPSD(L_rw):
---> 18 raise ValueError('L_rw isn\'t PSD')
19 return L_rw
好像arpack
不是很稳定?还是我弄错了什么?有没有人可以解释一下?