0

对于我的任务,我必须对 600 个数据点应用 K-means 算法,这些数据点都有 60 个维度(或属性,如果你愿意的话)。我将 600 个数据点存储到 6 个集群中(所以 K=6),这就是我到目前为止所拥有的:

#include <iostream> //std::cout
#include <fstream> //std::ifstream
#include <string> //std::string
#include <sstream> //std::istringstream
#include <vector> //std::vector
#include <cmath> //std::cmath
#include <array> //std::array

#define K 6
#define MAX_ITERATIONS 10000000
#define NUM_ATTRIBUTES 60

struct Point{
        std::array<double, NUM_ATTRIBUTES> point;
        std::string classType;
};

struct Cluster{
        std::vector<Point> points;
        Point centroid;
};

int randNumGenerator(int max){
        int num = (rand() % max);
        return num;
}

void setData(Point p, std::string line, std::vector<Point> &data, int index){
        std::stringstream s(line);
        std::string classes[] = {"Normal", "Cyclic", "Increasing trend", 
                                "Decreasing trend", "Upward shift", "Downward shift"};

        for(int i = 0; i < NUM_ATTRIBUTES; i++){
                double num;
                if(s >> num){
                        p.point[i] = num;
                }
        }

        if(index > 0 && index <= 100){
                p.classType = classes[0];
        }
        else if(index > 100 && index <= 200){
                p.classType = classes[1];
        }
        else if(index > 200 && index <= 300){
                p.classType = classes[2];
        }
        else if(index > 300 && index <= 400){
                p.classType = classes[3];
        }
        else if(index > 400 && index <= 500){
                p.classType = classes[4];
        }
        else if(index > 500 && index <= 600){
                p.classType = classes[5];
        }
        data.push_back(p);
}

void initializeCentroids(std::vector<Point> &points, int num_clusters,
                                std::vector<Point> &centroids){
        Point p;
        std::vector<bool> numsUsedAlready(points.size());
        for(int i = 0; i < num_clusters; i++){
                int randNum = randNumGenerator(points.size());
                while(numsUsedAlready[randNum]){
                        randNum = randNumGenerator(points.size());
                }
                numsUsedAlready[randNum] = true;
                p = points[randNum];
                centroids.push_back(p);
        }
}

double calculateDistance(Point p, Point centroid){
        double ret = 0;
        for(int i = 0; i < p.point.size(); i++){
                double distance = p.point[i] - centroid.point[i];
                ret += distance * distance;
        }
        return sqrt(ret);
}

void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){
        for(int i = 0; i < centroids.size(); i++){
                Cluster c;
                c.points.push_back(centroids[i]);
                c.centroid = centroids[i];
                clusters.push_back(c);
        }
}

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                Point p = points[index];
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].points.push_back(p);
        }
}

void updateCentroid(std::vector<Cluster> &clusters){
        for(int i = 0; i < clusters.size(); i++){
                Cluster c = clusters[i];
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        double avg = 0;
                        for(int h = 0; h < c.points.size(); h++){
                                Point p = c.points[h];
                                avg += p.point[j];
                        }
                        double oldCentroidValue = c.centroid.point[j];
                        c.centroid.point[j] = avg / c.points.size();
                        std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl;
                }
        }
}

void k_clustering(std::vector<Point> &points, int num_clusters){
        std::vector<Point> centroids;
        initializeCentroids(points, num_clusters, centroids);
        std::vector<Cluster> clusters;
        setCentroids(clusters, centroids);

        /*for(int i = 0; i < clusters.size(); i++){
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        std::cout << clusters[i].centroid.point[j] << " ";
                }
                std::cout << std::endl;
        }
        */

        setClusters(clusters, points);
        updateCentroid(clusters);
        for(int i = 0; i < clusters.size(); i++){
                std::cout << i << " " << clusters[i].points.size() << std::endl;
        }
}

void readInputFile(std::ifstream &file, std::vector<Point> &data){
        std::string line;
        int counter = 0;
        while(getline(file,line)){
                counter++;
                Point p;
                setData(p, line, data, counter);
        }
}

void usageString(){
        std::cout << "Usage: output <input_file>" << std::endl;
}

char* checkFile(int argc, char *argv[]){
        if(argc < 2){
                usageString();
                exit(0);
        }
        else{
                char *inputfile = argv[1];
                return inputfile;
        }
}

int main(int argc, char *argv[]){
        char *inputfile;
        inputfile = checkFile(argc, argv);

        std::ifstream input(inputfile);
        if(!input.is_open()){
                std::cerr << "Error: Data file doesn't exist" << std::endl;
                return EXIT_FAILURE;
        }
        srand(time(NULL));
        std::vector<Point> data;
        readInputFile(input, data);
        k_clustering(data, K);
        //printData(data);
        //Attribute closest cluster to each data point
          //For each point, calculate distance to each centroid
            //Assign point to cluster (and update centroid value, by finding mean values of all points)
          //Repeat until nothing changed or after a certain number of iterations

        return 1;
}

但是,我注意到我的代码仅适用于一次迭代。但是在第一次迭代之后,在我的“setClusters”函数中,我正在推进一个点。我必须将该点移动到另一个集群(如果需要),但是我很困惑如何做到这一点,而不需要从该集群中删除该点然后将其推送到另一个集群。我确信有一种更有效的方法可以将 Point 换出到不同的集群。

4

1 回答 1

0

我没有时间浏览整个代码,但是如果您对优化感兴趣,setClusters()那么也许您可以尝试 C++ 的移动语义和vector.emplace_back()

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                Point& p = points[index];  // use a reference to avoid copying, unless you don't want to alter the original "points" vector
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].points
                                       .emplace_back(    // construct the new item in place
                                           std::move(p)  // by "moving" from p
                                        );
        }
        points.clear();  // clear the input vector, unless you don't want to alter it. cf. the comment on using references
}

另一个想法是让Clusters 持有“指数”而不是实际的“点”。在这种情况下,您可以将结构声明为:

struct Cluster{
        std::vector<int> pointIndices;
        Point centroid;
};

然后,在 中setClusters(),您可以只推送索引:

void setClusters(std::vector<Cluster> &clusters, const std::vector<Point> points){  // points can be const now
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                const Point& p = points[index];  // use a reference to avoid copying
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].pointIndices
                                       .push_back(index);  // cheap compared to pushing a "Point"
        }
}

我希望它有所帮助。

于 2015-12-02T07:09:02.667 回答