对于我的任务,我必须对 600 个数据点应用 K-means 算法,这些数据点都有 60 个维度(或属性,如果你愿意的话)。我将 600 个数据点存储到 6 个集群中(所以 K=6),这就是我到目前为止所拥有的:
#include <iostream> //std::cout
#include <fstream> //std::ifstream
#include <string> //std::string
#include <sstream> //std::istringstream
#include <vector> //std::vector
#include <cmath> //std::cmath
#include <array> //std::array
#define K 6
#define MAX_ITERATIONS 10000000
#define NUM_ATTRIBUTES 60
struct Point{
std::array<double, NUM_ATTRIBUTES> point;
std::string classType;
};
struct Cluster{
std::vector<Point> points;
Point centroid;
};
int randNumGenerator(int max){
int num = (rand() % max);
return num;
}
void setData(Point p, std::string line, std::vector<Point> &data, int index){
std::stringstream s(line);
std::string classes[] = {"Normal", "Cyclic", "Increasing trend",
"Decreasing trend", "Upward shift", "Downward shift"};
for(int i = 0; i < NUM_ATTRIBUTES; i++){
double num;
if(s >> num){
p.point[i] = num;
}
}
if(index > 0 && index <= 100){
p.classType = classes[0];
}
else if(index > 100 && index <= 200){
p.classType = classes[1];
}
else if(index > 200 && index <= 300){
p.classType = classes[2];
}
else if(index > 300 && index <= 400){
p.classType = classes[3];
}
else if(index > 400 && index <= 500){
p.classType = classes[4];
}
else if(index > 500 && index <= 600){
p.classType = classes[5];
}
data.push_back(p);
}
void initializeCentroids(std::vector<Point> &points, int num_clusters,
std::vector<Point> ¢roids){
Point p;
std::vector<bool> numsUsedAlready(points.size());
for(int i = 0; i < num_clusters; i++){
int randNum = randNumGenerator(points.size());
while(numsUsedAlready[randNum]){
randNum = randNumGenerator(points.size());
}
numsUsedAlready[randNum] = true;
p = points[randNum];
centroids.push_back(p);
}
}
double calculateDistance(Point p, Point centroid){
double ret = 0;
for(int i = 0; i < p.point.size(); i++){
double distance = p.point[i] - centroid.point[i];
ret += distance * distance;
}
return sqrt(ret);
}
void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){
for(int i = 0; i < centroids.size(); i++){
Cluster c;
c.points.push_back(centroids[i]);
c.centroid = centroids[i];
clusters.push_back(c);
}
}
void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
int sendToCluster = 99999999;
for(int index = 0; index < points.size(); index++){
double minDist = 99999999999;
Point p = points[index];
for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
//std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
if(tempDist < minDist){
minDist = tempDist;
sendToCluster = clusterNum;
}
}
//std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl;
clusters[sendToCluster].points.push_back(p);
}
}
void updateCentroid(std::vector<Cluster> &clusters){
for(int i = 0; i < clusters.size(); i++){
Cluster c = clusters[i];
for(int j = 0; j < NUM_ATTRIBUTES; j++){
double avg = 0;
for(int h = 0; h < c.points.size(); h++){
Point p = c.points[h];
avg += p.point[j];
}
double oldCentroidValue = c.centroid.point[j];
c.centroid.point[j] = avg / c.points.size();
std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl;
}
}
}
void k_clustering(std::vector<Point> &points, int num_clusters){
std::vector<Point> centroids;
initializeCentroids(points, num_clusters, centroids);
std::vector<Cluster> clusters;
setCentroids(clusters, centroids);
/*for(int i = 0; i < clusters.size(); i++){
for(int j = 0; j < NUM_ATTRIBUTES; j++){
std::cout << clusters[i].centroid.point[j] << " ";
}
std::cout << std::endl;
}
*/
setClusters(clusters, points);
updateCentroid(clusters);
for(int i = 0; i < clusters.size(); i++){
std::cout << i << " " << clusters[i].points.size() << std::endl;
}
}
void readInputFile(std::ifstream &file, std::vector<Point> &data){
std::string line;
int counter = 0;
while(getline(file,line)){
counter++;
Point p;
setData(p, line, data, counter);
}
}
void usageString(){
std::cout << "Usage: output <input_file>" << std::endl;
}
char* checkFile(int argc, char *argv[]){
if(argc < 2){
usageString();
exit(0);
}
else{
char *inputfile = argv[1];
return inputfile;
}
}
int main(int argc, char *argv[]){
char *inputfile;
inputfile = checkFile(argc, argv);
std::ifstream input(inputfile);
if(!input.is_open()){
std::cerr << "Error: Data file doesn't exist" << std::endl;
return EXIT_FAILURE;
}
srand(time(NULL));
std::vector<Point> data;
readInputFile(input, data);
k_clustering(data, K);
//printData(data);
//Attribute closest cluster to each data point
//For each point, calculate distance to each centroid
//Assign point to cluster (and update centroid value, by finding mean values of all points)
//Repeat until nothing changed or after a certain number of iterations
return 1;
}
但是,我注意到我的代码仅适用于一次迭代。但是在第一次迭代之后,在我的“setClusters”函数中,我正在推进一个点。我必须将该点移动到另一个集群(如果需要),但是我很困惑如何做到这一点,而不需要从该集群中删除该点然后将其推送到另一个集群。我确信有一种更有效的方法可以将 Point 换出到不同的集群。