我的目标是根据 xy 点的接近度对它们进行聚类。具体来说,将彼此靠近的点进行分组。我也希望使用一个单独的参考点来对数据进行聚类。
注意:我有多组需要独立聚类的数据。例如,下面 Item 列中的每个唯一值
都表示一组不同的数据。我可能有多个不同的数据集,它们的稀疏程度各不相同。因此,任何需要预先指定簇数量的技术都不现实,因为每次我都必须手动检查拟合效果并调整相应的参数。
因此,迄今为止最好的方法是某种形式的密度聚类(DBSCAN、OPTICS)。
不过,在把彼此靠近的点聚为一簇的同时,我希望施加某种截断,使得到的簇保持预期的球形。另一方面,我又不想把可达区域缩得太小,否则会漏掉那些靠近参考点和核心点的点:只要出现一个小的间隙,我希望包含的点就会被丢弃。
下面的示例展示了这一困境。Item 1
说明应当如何降低可达性,以确保参考点周围聚集的点呈球形。而 Item 2
则说明可达区域需要更大,才能包含密集区域内的点。
我希望能够通过调整某个参数或加入一个单独的函数来实现这一点,而不是强行设定硬性限制。因为参考点周围的密集区域可能会有所不同,所以我不愿意强制排除特定半径之外的每个点。
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
import seaborn as sns
from sklearn.cluster import OPTICS
# One 6x6 inch axes; both the raw points and the cluster colouring are drawn on it.
fig, ax = plt.subplots(figsize = (6,6))
ax.grid(False)
# Demo data. Each distinct 'Item' value is a separate data set that is
# selected and clustered on its own further down. (x, y) are the points to
# cluster; (X_Ref, Y_Ref) is the reference point, repeated on every row of
# an Item: (0, 0) for Item 1 and (-4, -1) for Item 2.
df = pd.DataFrame({
'Item' : [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2],
'x' : [-4.0,-1.0,0.5,0.0,0.0,2.0,3.0,5.0,10.0,-2.0,2.0,5.0,7.5,15.0,0.0,-22.0,-20.0,-20.0,-6.5,20.5,0.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0,-2.0,0.0,3.0,-3.0,-7.0,-7.5,-9.0,-4.0,1.5,-1.0,-5.0,-4.5,-3.7,15.0,-20.0,-22.0,-20.0,-20.0,-12.0,20.5,6.0,20.0,-20.0,-15.0,20.0,-15.0,-10.0],
'y' : [0.0,1.0,-0.5,0.5,-0.5,0.0,1.0,0.0,0.0,-2.0,-2.0,-7.0,-0.5,-10.5,-7.5,0.0,16.0,-15.0,5.0,13.5,3.0,-20.0,2.0,-17.5,-15,19.0,20.0,4.0,-2.0,0.0,0.0,2.5,2.0,-1.5,5.0,0.0,3.5,2.0,-5.5,-6.5,-10.5,-20.5,0.0,16.0,-15.0,5.0,13.5,6.0,-20.0,2.0,-17.5,-15,19.0,20.0],
'X_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0],
'Y_Ref' : [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0],
})
# Select the data set to work on. Item 1 is the "resulting cluster is not
# spherical" case.
# .copy() detaches the selection from the full frame, so the 'distance'
# column added below is written to an independent DataFrame rather than to a
# filtered slice (which makes pandas emit SettingWithCopyWarning).
df = df[df['Item'] == 1].copy()
# Item 2 is the "spherical, but reachable area too small" case:
#df = df[df['Item'] == 2].copy()

# Euclidean distance of every point from the reference point on its row.
df['distance'] = np.sqrt((df['X_Ref'] - df['x'])**2 + (df['Y_Ref'] - df['y'])**2)

# Plain two-column (x, y) coordinate array handed to the clusterer.
Y_sklearn = df[['x', 'y']].to_numpy()

# Raw points, with the reference point drawn on top (zorder = 2) as a white,
# black-edged marker.
ax.scatter(df['x'], df['y'], marker = 'o', s = 5)
ax.scatter(df['X_Ref'], df['Y_Ref'], c = 'w', edgecolor = 'k', marker = 'o', s = 7.5, zorder = 2)
# Alternative clusterer kept for comparison: DBSCAN with a fixed eps.
#clusterer = DBSCAN(eps = 7.5, min_samples = 3)
#labels_clusters = clusterer.fit_predict(Y_sklearn)
clusterer = OPTICS(min_samples = 2, xi = 0.25, min_cluster_size = 0.25, max_eps = 5)
# fit_predict() fits the estimator itself and returns the labels, so a
# separate fit() call beforehand would only run the clustering twice.
labels_clusters = clusterer.fit_predict(Y_sklearn)

# Attach the labels as a categorical 'cluster' column so seaborn colours them
# as discrete groups. assign() returns a new DataFrame, so this never writes
# into a filtered slice of another frame.
df = df.assign(cluster = pd.Categorical(labels_clusters))

# Overlay the points on the existing axes, coloured by cluster label.
sns.scatterplot(
    data = df,
    x = 'x',
    y = 'y',
    hue = 'cluster',
    ax = ax,
    legend = 'full',
)
Item 1:位于半径右侧的点应当从核心点中排除