cuda - 我使用 cudaEvent 来测量我的代码的运行时间。但我发现了一些谜题

Question

我使用以下代码来测量我的代码的运行时间；

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>

#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;

const int MINCOUNTS = 20;
const int h = 10;
const int dim = 2;
//const int h2 = pow(double(h),double(dim));



struct DataType
{
    float d[dim];
};





void loadData(thrust::host_vector<DataType>& D_,string dir_, DataType& gt)
{



    fstream in(dir_.c_str(),ios::in);
    string tline;
    string::size_type position;
    getline(in,tline);

    int flag = atoi(tline.c_str());
    if(flag != 1)
    {
        cout<<"there is problem in file : "<<dir_<<endl;
        exit(-1);
    }

    getline(in,tline);
    int tot = atoi(tline.c_str());

    getline(in,tline);




    for(int i = 0; i < dim - 1; i++)
    {
        position = tline.find(" ");
        gt.d[i] = atof(tline.substr(0,position).c_str());
        tline = tline.substr(position+1, tline.size() - position);
    }
    gt.d[dim-1] = atof(tline.c_str());

    DataType dt;
    for(int i = 0; i < tot-1; i++)
    {
        getline(in,tline);
        for(int i = 0; i < dim - 1; i++)
        {
            position = tline.find(" ");
            dt.d[i] = atof(tline.substr(0,position).c_str());
            tline = tline.substr(position+1, tline.size() - position);
        }
        dt.d[dim-1] = atof(tline.c_str());
        D_.push_back(dt);
    }
}



__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1,int N)
{
    int index = blockIdx.x;
    if(index < N)
    {
        gpu_Mchanged1[index] = index;
        gpu_Schanged1[index] = index;
    }

}


//parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N, h);
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h is the band-width of the kernel function;
{

    int index = blockIdx.x;
    int dis = 0;

    int ii = 0;
    int i0 = 0;

    int inlierNum = 0;
    //  double h2 = 10000;

    if(index < tot)
    {
        dis = 0;
        i0 = gpu_Schanged1[index];

        for(unsigned int i = 0; i < dim; i++)
            msPoint[index].d[i] = 0;

        for(int i = 0 ;i < tot ;i++)
        {
            ii = gpu_Schanged1[i];


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
            {
                dis += (input[i0].d[j] - input[ii].d[j])*(input[i0].d[j] - input[ii].d[j]);
                if(dis > pow(double(h),2.0))
                    break;
            }

            if (dis < pow(double(h),2.0))
            {
                inlierNum++;
                for(unsigned int j = 0; j < dim; j++)
                    msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
            }
        }


        //      msPoint[index].d[0] = inlierNum;
        for(unsigned int j = 0; j < dim; j++)
        {
            msPoint[index].d[j] /= inlierNum;
            msPoint[index].d[j] += input[i0].d[j];
        }

    }
}


//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = 1000000;
    int flag = -1;
    int i1;
    if(index < tot)
    {

        for(int i = 0; i < tot; i++)
        {
            i1 = gpu_Schanged1[i];

            dis = 0;
            for(int j = 0; j < dim; j++)
                dis += (msPoint[index].d[j] - input[i1].d[j])*(msPoint[index].d[j] - input[i1].d[j]);

            if(dis <= disMin)
            {
                disMin = dis;
                flag = i1;
            }
        }
        Sunchanged[gpu_Schanged1[index]] = index;
        Munchanged[gpu_Schanged1[index]] = flag;
        gpu_Mchanged1[index] = flag;
    }
}
////routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
    int index = blockIdx.x;
    bool find2 = false;

    if(index < tot)
    {
        int lastOne = -1;
        int thisOne = -1;
        int indexIter = index;
        while(1)
        {



            if(loop1st)
            {
                lastOne = gpu_Mchanged1[indexIter];
                thisOne = gpu_Mchanged1[lastOne];

                if(lastOne == thisOne)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
                    gpu_Mchanged1[index] = thisOne;
                    break;
                }
                indexIter = thisOne;
            }

            else
            {
                //              gpu_Mchanged1[index] = gpu_Schanged1[index];

                while(1)
                {
                    lastOne = gpu_Mchanged1[indexIter];
                    for(int i = 0; i < tot; i++)
                    {
                        if(i == indexIter)
                            continue;

                        if(lastOne == gpu_Schanged1[i])
                        {
                            thisOne = i;
                            find2 = true;
                            break;
                        }
                    }
                    if(find2 == false)
                        break;
                    indexIter = thisOne;
                    find2 = false;

                }
                if(thisOne != index && thisOne != -1)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
                    gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
                }
                break;
            }
        }
    }

}
//

__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
    int index = blockIdx.x;
    int router0, router1;
    if(index < N)
    {
        if(loop1st)
        {
            router0 = gpu_Mchanged1[index];
            router1 = gpu_Mchanged1[router0];
            while(1)
            {

                if(index == router0 || index == router1)
                {
                    gpu_Munchanged[index] = index;
                    break;
                }
                if(router0 == router1)
                    break;
                router0 = gpu_Mchanged1[router1];
                router1 = gpu_Mchanged1[router0];
            }
        }

    }


}
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
    int index = blockIdx.x;
    if(index < smallTot)
    {
        int count = 0;
        for(unsigned int i = 0; i < tot; i++)
        {
            if(gpu_Munchanged[i] == gpu_Mchanged1[index])
                count++;
        }
        gpu_clusterSize[index] = count;
        if(count <= MINCOUNTS)
            gpu_Schanged1[index] = -1;
    }

}
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
    int index = blockIdx.x;
    if(index < tot)
    {
        if (gpu_Kcounts[gpu_Munchanged[index]] < MINCOUNTS)
        {
            gpu_Sunchanged[index] = -1;
        }
    }


}

struct increaseOne: public thrust::unary_function<int, int>
{
    int operator()(int a_){return a_++;}

};
//
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = pow(double(h/2),2.0);
    int disMinIndex = -1;
    bool flag = false;
    if(index < tot)
    {
        for(unsigned int i = 0; i < tot; i++)
        {
            if(index == i)
                continue;


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
                dis += (gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j])*(gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j]);
            //          dis = (gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)*(gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)+(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2)*(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2);

            if(dis < disMin)
            {
                flag = true;
                disMin = dis;
                disMinIndex = i;
            }
        }

        if(flag)
            if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
            {
                gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
                gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];

            }
    }
}



struct is_minus_one
{
    __host__ __device__
    bool operator()(const int x)
    {
        return(x == -1);
    }
};

typedef thrust::device_vector<int>::iterator dintiter;

int main(int argc, char** argv)
{
    //  int h = 100;
    using namespace std;
    thrust::host_vector<DataType> host_input;
    //  string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
    string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
    DataType gt;
    loadData(host_input,dir, gt);
    cudaEvent_t start,stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int loopTime = 100;
    float timeSum = 0;

    thrust::device_vector<DataType> device_input = host_input;  // Host端vector
    int N = device_input.size();
    int rN = N;
    int lastSize, thisSize;

    DataType *gpu_input;
    gpu_input = thrust::raw_pointer_cast(&device_input[0]);

    thrust::device_vector<DataType> device_msPoint;
    device_msPoint.resize(N);
    DataType *gpu_msPoint;

    thrust::device_vector<int> device_Sunchanged;
    device_Sunchanged.resize(N);
    int *gpu_Sunchanged;
    gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);

    thrust::device_vector<int> device_Munchanged;
    device_Munchanged.resize(N);
    int *gpu_Munchanged;
    gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);

    thrust::device_vector<int> device_Schanged1;
    device_Schanged1.resize(N);
    int *gpu_Schanged1;
    gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);

    thrust::device_vector<int> device_Mchanged1;
    device_Mchanged1.resize(N);
    int *gpu_Mchanged1;
    gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);

    thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;

    thrust::device_vector<int> device_clusterSize;

    initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N);

    bool loop1st = true;
    dintiter Mend, Send, Cend;
    int *gpu_clusterSize;
    gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);




    for(int i = 0; i < loopTime; i++)
    {

        cudaFree(0);
        cudaEventRecord(start,0);


        while(1)
        {
            lastSize = device_Schanged1.size();
            N = lastSize;
            device_msPoint.resize(N);

            parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); //the size of the gpu_msPoint is as the same as the gpu_Mchanged1; but the gpu_input is the original data size
            device_Mchanged1.resize(N);
            nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);

            routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);


            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            //
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N = new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);

            device_clusterSize.clear();
            device_clusterSize.resize(N);

            gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
            checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN);

            Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
            Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
            Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);

            N =  Send - device_Schanged1.begin();
            device_Schanged1.resize(N);
            device_Mchanged1.resize(N);
            device_clusterSize.resize(N);
            mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N);
            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N =  new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);


            thisSize = N;
            if(lastSize == thisSize)
                break;
            loop1st = false;

            thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
            device_Mchanged1.clear();
            gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        cudaEventElapsedTime(&time, start, stop);
        //      for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
        //          cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;

        timeSum += time;
        cout<<i<<" "<<time<<endl;
    }
    cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;









    return 0;


}

每个循环中变量时间的输出都不相同，这是我得到的结果：

但是每次我测试的代码都做同样的事情。我应该相信哪个结果？我对此感到非常困惑。谢谢。

score 1 · Accepted Answer

既然您已经发布了实际代码，我看不出您的计时方法有任何明显的问题。关于cudaFree(0);和启动时间的陈述无关紧要，因为您的代码在您的第一个时序之前就已经创建了一个 cuda 上下文。

我无法运行您的代码，因为它取决于我没有的数据文件。然而，对时间变化最可能的解释是时间或正在完成的工作存在实际变化。即使您似乎在运行相同的代码，但每次运行所需的时间可能不同。

一些可能的例子（我并不是说你的代码是这样的；我不知道）：

如果序列已经排序，则推力::sort_by_key 将花费不同的时间，而不是未排序的序列。由于您正在进行就地排序，因此出现的问题是您是否正在对已经排序的数据进行排序。第一次通过计时循环可能会对数据进行排序，而随后的通过可能会对已经排序的数据进行排序，这将花费更少的时间。
另一个例子是您的所有.resize(N)操作。在我看来，第一次通过循环时，这些可能正在做一些实际的调整大小，而在随后的通过中，如果N没有改变，那么就没有实际的调整大小，因此操作花费的时间更少。

同样，我不知道这些假设是否适用于您的代码，我只是指出在某些情况下重复运行相同的代码序列并观察时序变化是如何可能的。

显然，由于代码是相同的，因此问题就变成了分析数据以查看每次运行是否相同。一个有趣的测试可能是在遇到指令之前跟踪通过while(1)循环的次数。break;这也可能对正在发生的事情具有指导意义。

cuda - 我使用 cudaEvent 来测量我的代码的运行时间。但我发现了一些谜题

1 回答 1

Related

Reference