请协助。
我收到错误消息:ValueError: Array 'mean' must be a vector of length 4
我的代码如下:
#初始化:
def initialize(data, K):
mu_0 = np.mean(data, 0)
cov = np.cov(data.T)
init_mus = mvn.rvs(size=K, mean=mu_0, cov=cov)
init_covs = np.tile(cov,(K,1,1))
init_pi = np.ones(K) / K
return init_mus, init_covs, init_pi
#E步
from scipy import stats
def e_step(data, mus, covs, pi):
np.log([stats.multivariate_normal(mus, covs).pdf(data)])
log_p_y_x = np.log([1-pi, pi])[np.newaxis, ...] + \
np.log([stats.multivariate_normal(mus, covs).pdf(data)]).T
log_p_y_x_norm = logsumexp(log_p_y_x, axis=1)
return log_gammas
#M 步
def m_step(data, mus, covs, pi):
total_count = data.shape[0]
_, heuristics = e_step(data, mus, covs, pi)
heuristic1 = heuristics[:, 1]
sum_heuristic1 = np.sum(heuristic1)
pi = (sum_heuristic1/total_count)
mus = (heuristic1[..., np.newaxis].T.dot(data)/sum_heuristic1).flatten()
diff1 = data - mus
covs = diff1.T.dot(diff1 * heuristic1[..., np.newaxis]) / sum_heuristic1
return mus, covs, pi
#对数似然
def log_likelihood(data, mus, covs, pi,k):
k=int
loglikelihood, _ = e_step(data, mus, covs, pi)
ll= np(loglikelihood)
return ll
#期望最大化
def EM(data, K, max_iters):
init_mus, init_covs, init_pi = initialize(data, K)
mus = init_mus
covs = init_covs
pi = init_pi
lls = []
ll_prev = -np.inf
for i in range(max_iters):
k=int
ll =log_likelihood(data,mus, covs, pi,k)
lls.append(ll)
return mus, covs, pi, log_gammas, lls
#训练
def fit_gmm(data, K, num_restarts = 300):
ll_best = -np.inf
for i in range(num_restarts):
mus, covs, pi, log_gammas, lls = EM(data, K, max_iters=1000)
if lls[-1] > ll_best:
mus_best = mus
covs_best = covs
pi_best = pi
log_gammas_best = log_gammas
ll_best = lls[-1]
lls_best = lls
return mus_best, covs_best, pi_best, log_gammas_best, lls_best
fig, axes = plt.subplots(3, 4, figsize=(10, 8), sharey='row')
print("================= data 1 ==================")
data1_GMM_results = []
for K in range(2, 6):
mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data1, K, num_restarts = 10)
cluster_assignments = np.argmax(log_gammas_best, axis=1)
plot_clusters(data1, mus_best, covs_best, labels1, K, axes[0,K-2])
data1_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
print("================= data 2 ==================")
data2_GMM_results = []
for K in range(2, 6):
mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data2, K, num_restarts = 10)
cluster_assignments = np.argmax(log_gammas_best, axis=1)
plot_clusters(data2, mus_best, covs_best, labels2, K, axes[1,K-2])
data2_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
print("================= data 3 ==================")
data3_GMM_results = []
for K in range(2, 6):
mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data3, K, num_restarts = 10)
cluster_assignments = np.argmax(log_gammas_best, axis=1)
plot_clusters(data3, mus_best, covs_best, labels3, K, axes[2,K-2])
data3_GMM_results.append((mus_best, covs_best, cluster_assignments, LLs_best[-1]))
for K in range(2, 6):
axes[0,K-2].set_title("K = %d" % K)
for i in range(1, 4):
axes[i-1,0].set_ylabel("Dataset %d" % i)
plt.tight_layout()
错误:
================= data 1 ==================
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-e50708c0dbad> in <module>
4 data1_GMM_results = []
5 for K in range(2, 6):
----> 6 mus_best, covs_best, pi_best, log_gammas_best, LLs_best = fit_gmm(data1, K, num_restarts = 10)
7 cluster_assignments = np.argmax(log_gammas_best, axis=1)
8 plot_clusters(data1, mus_best, covs_best, labels1, K, axes[0,K-2])
<ipython-input-10-114ce239b4c1> in fit_gmm(data, K, num_restarts)
6 ll_best = -np.inf
7 for i in range(num_restarts):
----> 8 mus, covs, pi, log_gammas, lls = EM(data, K, max_iters=1000)
9 if lls[-1] > ll_best:
10 mus_best = mus
<ipython-input-9-4e5aa1b04da1> in EM(data, K, max_iters)
20 for i in range(max_iters):
21 k=int
---> 22 ll =log_likelihood(data,mus, covs, pi,k)
23 lls.append(ll)
24 return mus, covs, pi, log_gammas, lls
<ipython-input-8-7c3f0c433124> in log_likelihood(data, mus, covs, pi, k)
8 """
9 k=int
---> 10 loglikelihood, _ = e_step(data, mus, covs, pi)
11 ll= np(loglikelihood)
12 return ll
<ipython-input-6-400f900216a0> in e_step(data, mus, covs, pi)
7 log_gammas: N x K, the matrix specifying the log probability of each point belonging to cluster k
8 """
----> 9 np.log([stats.multivariate_normal(mus, covs).pdf(data)])
10 log_p_y_x = np.log([1-pi, pi])[np.newaxis, ...] + \
11 np.log([stats.multivariate_normal(mus, covs).pdf(data)]).T
~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in __call__(self, mean, cov, allow_singular, seed)
361 return multivariate_normal_frozen(mean, cov,
362 allow_singular=allow_singular,
--> 363 seed=seed)
364
365 def _process_parameters(self, dim, mean, cov):
~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in __init__(self, mean, cov, allow_singular, seed, maxpts, abseps, releps)
733 self._dist = multivariate_normal_gen(seed)
734 self.dim, self.mean, self.cov = self._dist._process_parameters(
--> 735 None, mean, cov)
736 self.cov_info = _PSD(self.cov, allow_singular=allow_singular)
737 if not maxpts:
~\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py in _process_parameters(self, dim, mean, cov)
405 if mean.ndim != 1 or mean.shape[0] != dim:
406 raise ValueError("Array 'mean' must be a vector of length %d." %
--> 407 dim)
408 if cov.ndim == 0:
409 cov = cov * np.eye(dim)
ValueError: Array 'mean' must be a vector of length 4.