下面是一个小测试台,用于查看
scipy.spatial.cKDTree
在您的数据上的速度,并大致了解附近点之间的距离如何分散。
为各种 K 运行 K-cluster 的一个好方法是构建最近对的 MST,并删除最长的 K-1;见
韦恩,贪心算法。
可视化集群会很有趣——使用 PCA 投影到 2d?
(只是好奇,你的 K 是 10、100、1000 吗?)
12 月 17 日添加:实际运行时间:100000 x 5 10 秒、500000 x 5 60 秒
#!/usr/bin/env python
# time scipy.spatial.cKDTree build, query
from __future__ import division
import random
import sys
import time
import numpy as np
from scipy.spatial import cKDTree as KDTree
# http://docs.scipy.org/doc/scipy/reference/spatial.html
# $scipy/spatial/kdtree.py is slow but clean, 0.9 has cython
__date__ = "2010-12-17 dec denis"
def clumpiness( X, nbin=10 ):
""" how clumpy is X ? histogramdd av, max """
# effect on kdtree time ? not much
N, dim = X.shape
histo = np.histogramdd( X, nbin )[0] .astype(int) # 10^dim
n0 = histo.size - histo.astype(bool).sum() # uniform: 1/e^lambda
print "clumpiness: %d of %d^%d data bins are empty av %.2g max %d" % (
n0, nbin, dim, histo.mean(), histo.max())
#...............................................................................
N = 100000
nask = 0 # 0: ask all N
dim = 5
rnormal = .9
# KDtree params --
nnear = 2 # k=nnear+1, self
leafsize = 10
eps = 1 # approximate nearest, dist <= (1 + eps) * true nearest
seed = 1
exec "\n".join( sys.argv[1:] ) # run this.py N= ...
np.random.seed(seed)
np.set_printoptions( 2, threshold=200, suppress=True ) # .2f
nask = nask or N
print "\nkdtree: dim=%d N=%d nask=%d nnear=%d rnormal=%.2g leafsize=%d eps=%.2g" % (
dim, N, nask, nnear, rnormal, leafsize, eps)
if rnormal > 0: # normal point cloud, .9 => many near 1 1 1 axis
cov = rnormal * np.ones((dim,dim)) + (1 - rnormal) * np.eye(dim)
data = np.abs( np.random.multivariate_normal( np.zeros(dim), cov, N )) % 1
# % 1: wrap to unit cube
else:
data = np.random.uniform( size=(N,dim) )
clumpiness(data)
ask = data if nask == N else random.sample( data, sample )
t = time.time()
#...............................................................................
datatree = KDTree( data, leafsize=leafsize ) # build the tree
print "%.1f sec to build KDtree of %d points" % (time.time() - t, N)
t = time.time()
distances, ix = datatree.query( ask, k=nnear+1, eps=eps )
print "%.1f sec to query %d points" % (time.time() - t, nask)
distances = distances[:,1:] # [:,0] is all 0, point to itself
avdist = distances.mean( axis=0 )
maxdist = distances.max( axis=0 )
print "distances to %d nearest: av" % nnear, avdist, "max", maxdist
# kdtree: dim=5 N=100000 nask=100000 nnear=2 rnormal=0.9 leafsize=10 eps=1
# clumpiness: 42847 of 10^5 data bins are empty av 1 max 21
# 0.4 sec to build KDtree of 100000 points
# 10.1 sec to query 100000 points
# distances to 2 nearest: av [ 0.07 0.08] max [ 0.15 0.18]
# kdtree: dim=5 N=500000 nask=500000 nnear=2 rnormal=0.9 leafsize=10 eps=1
# clumpiness: 2562 of 10^5 data bins are empty av 5 max 80
# 2.5 sec to build KDtree of 500000 points
# 60.1 sec to query 500000 points
# distances to 2 nearest: av [ 0.05 0.06] max [ 0.13 0.13]
# run: 17 Dec 2010 15:23 mac 10.4.11 ppc