我在下面的代码中使用具有不同标签数组的相同数据集来计算 SSE:
import sklearn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import os
import pandas as pd
import xlrd
import pickle
import csv
from numpy import savetxt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Build the sample matrix and the matching label vector from the exported
# workbooks: one workbook per (cluster, band) pair.
#
# Outputs consumed further down the script:
#   acabou_valor -- 2-D array of decimal-point strings, one row per sample and
#                   one column per workbook index in band_ids
#   acabou_label -- 1-D int64 array holding the 1-based cluster id of each row

# Number of clusters; also selects the input folder and how many
# Cluster_<c> sub-folders are read.
n_cluster = 8
# Second index used in the workbook file name (presumably the band number,
# given the MEDIANA_BANDA_ prefix -- confirm against the export step).
band_ids = range(2, 7)
workbook_template = "C:/Users/guilh/Desktop/SENSORIAMENTO/SENTINEL_2/{}/Cluster_{}/MEDIANA_BANDA_CLUSTER{}{}.xlsx"

cluster_values = []  # one (n_samples, n_bands) string array per cluster
cluster_labels = []  # one int64 label vector per cluster
for c in range(n_cluster):
    band_columns = []
    for n in band_ids:
        workbook_path = workbook_template.format(n_cluster, c, n, c)
        # The context manager releases the workbook handle as soon as the
        # sheet has been parsed instead of keeping every file open.
        with pd.ExcelFile(workbook_path) as xlsx:
            df = pd.read_excel(xlsx, sheet_name="Sheet1", keep_default_na=False)
        cells = df.to_numpy().flatten(order="A")
        # keep_default_na=False leaves blank cells as '', which this filter
        # removes.
        # NOTE(review): the truthiness test would also drop a numeric 0 cell;
        # confirm that no legitimate zero values can occur in the sheets.
        filled = [cell for cell in cells if cell]
        # Decimal commas -> decimal points so the strings can be cast to float.
        band_columns.append(np.char.replace(filled, ",", "."))
    # np.stack requires every band to have the same number of samples, so the
    # stacked row count is also the number of labels needed for this cluster.
    stacked = np.stack(band_columns, axis=-1)
    cluster_values.append(stacked)
    # Cluster ids are 1-based in the label vector.
    cluster_labels.append(np.full(stacked.shape[0], c + 1, dtype=np.int64))

acabou_valor = np.concatenate(cluster_values)
acabou_label = np.concatenate(cluster_labels)
# Computing the SSE
# Cast each column of the string matrix to floats, one 1-D array per band.
# The builtin float is used because the np.float alias was removed in
# NumPy 1.24 and raises AttributeError there.
BANDA = [
    acabou_valor[:, band].astype(float)
    for band in range(acabou_valor.shape[1])
]

# SSE for each cluster in each band. Entries are appended cluster by cluster,
# with one entry per band inside each cluster.
soma = []
for cluster_id in range(1, n_cluster + 1):
    # Labels are 1-based, so cluster_id is compared directly.
    members = acabou_label == cluster_id
    for band_values in BANDA:
        in_cluster = band_values[members]
        # Squared deviations from the mean of the samples carrying this label.
        deviations = np.subtract(in_cluster, np.mean(in_cluster))
        soma.append(np.square(deviations).sum())
# Aggregate the per-band SSE values into one total per cluster and one grand
# total, for any number of clusters.
#
# soma holds one entry per (cluster, band) pair, stored cluster by cluster, so
# each cluster owns a contiguous run of n_bands entries.
if n_cluster and len(soma) % n_cluster:
    raise ValueError(
        "soma has {} entries, which is not a multiple of n_cluster={}".format(
            len(soma), n_cluster
        )
    )
n_bands = len(soma) // n_cluster if n_cluster else 0

# Total SSE of each cluster: the sum of its per-band SSE values.
cluster_sse = [
    sum(soma[k * n_bands:(k + 1) * n_bands])
    for k in range(n_cluster)
]

# Overall SSE, kept as a one-element list so it prints as "[value]".
SSE_KMEANS = [sum(cluster_sse)]
print(SSE_KMEANS)
# One line per cluster, in cluster order, each printed as a one-element list.
for cluster_total in cluster_sse:
    print([cluster_total])
一切都很好,随着 K 的上升(如预期的那样),SSE 正在下降,直到我尝试用 K = 8 来计算它,它相对于 K = 7 增加了近 50。我检查了很多次代码,看看是否有任何计算错误但找不到任何东西。有人可以帮助我吗?
代码的第一部分用于从转换为 .xls 的栅格创建数据集,我甚至能够用它计算每个聚类的轮廓系数,所以我认为那部分没有问题。第二部分是 SSE(误差平方和)的计算。
K 均值聚类已经在 GEE 上完成,我只是在 python 上处理导出的数据。
我使用的是 Sentinel-2 影像的聚类结果;通过 Anaconda 使用 Spyder;Python 版本为 3.8.3。