您必须考虑定义与集群的接近度的正确指标是什么。基于 hclust 文档中的示例,这里有一种方法来计算每个集群的均值,然后测量新数据点与均值集之间的距离。
# Leave out one state
A <-USArrests
B <-A[rownames(A)!="Kentucky",]
KY <- A[rownames(A)=="Kentucky",]
# Put the B data into 10 clusters
hc <- hclust(dist(B), "ave")
memb <- cutree(hc, k = 10)
B$cluster = memb[rownames(B)==names(memb)]
# Compute the averages over the clusters
M <-aggregate( .~cluster, data=B, FUN=mean)
M$cluster=NULL
# Now add the hold out state to the set of averages
M <-rbind(M,KY)
# Compute the distance between the clusters and the hold out state.
# This is a pretty silly way to do this but it works.
D <- as.matrix(dist(as.matrix(M),diag=TRUE,upper=TRUE))["Kentucky",]
names(D) = rownames(M)
KYclust = which.min(D[-length(D)])
memb[memb==KYclust]
# Now cluster the full set of states and compare the results.
hc <- hclust(dist(A), "ave")
memb <- cutree(hc, k = 10)
a=memb[which(names(memb)=="Kentucky")]
memb[memb==a]