cuda - CUDA DE 内核未启动

Question

我正在尝试在 CUDA 上进行差异进化，但问题是负责“突变、交叉、评估、选择”的内核永远不会启动。

有什么帮助吗？

这是整个代码：

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[D*id+0]=a;
        mutation[D*id+1]=b;
        mutation[D*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

更新 - 解决方案：

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

score 1 · Accepted Answer

解决方案：

#include <iostream>
#include <curand_kernel.h>
using namespace std;

/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line )
{
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        system("pause");
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))


/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;


/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
                          + (a[1] - b[1]) * (a[1] - b[1])
                          + (a[2] - b[2]) * (a[2] - b[2]));
    float distance6 = distance * distance * distance
                      * distance * distance * distance;
    float distance12 = distance6 * distance6;
    return 1/distance12 - 2/distance6;
}

/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{
    curandState localState = globalState[id];
    float RANDOM = curand_uniform(&localState);
    globalState[id] = localState; 
    return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
        curand_init(seed, id, 0,&state[id]);
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id < NP)
    {
        //init, just populating array with some specific numbers
        population[D*id]=0;
        population[D*id+N]=0;
        population[D*id +2*N]=0;

        population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
        population[D*id+N+1]=0;
        population[D*id +2*N+1]=0;

        for(int i=2; i<N; i++){
            float min= -4 - 1/4*abs((int)((i-4)/3));
            float max= 4 + 1/4*abs((int)((i-4)/3));
            if(i==2)
            {
                population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
                population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+2]=0;
            }
            else
            {
                population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
                population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
            }
        }

        //eval
        float e=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
                e += lennardJones(a,b);
            }
        }
        population[D*id + D-1]=e;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;
    if(id<NP)
    {
        int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
        while(a == id){a = rndInt(globalState, id, NP);}
        while(b == a && b==id){b=rndInt(globalState, id, NP);}
        while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
        mutation[3*id+0]=a;
        mutation[3*id+1]=b;
        mutation[3*id+2]=c;
    }
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
    int id= threadIdx.x+blockIdx.x*blockDim.x;  
    if(id<NP)
    {
        int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];

        //DE mutation and crossover
        int j=rndInt(globalState, id, NP);
        for(int i=0; i<D-1; i++)
        {
            //DE mutation
            pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
            //DE crossover
            if(Cr > rndFloat(globalState, id) && i!= j)
                pop[D*id+i]=population[D*id +i];
        }
        // Eval
        pop[D*id+D-1]=0;
        for(int i=0; i<N; i++)
        {
            for(int j=0; j<i; j++)
            {
                float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
                pop[D*id+D-1] += lennardJones(a,b);
            }
        }

        __syncthreads();
        //DE selection
        if(pop[D*id+D-1] < population[D*id +D-1])
        {
            for(int i=0; i<D; i++)
                population[D*id +i]=pop[D*id+i];
        }
    }
}

void getBestScore(float *hpopulation)
{
    int max=0;
    for(int i=1; i<hNP; i++)
    {
        if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
            max=i;
    }
    for(int j=0; j<hN; j++)
        cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}

int main()
{
    cudaEvent_t start,stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));  
    HANDLE_ERROR(cudaEventRecord(start,0));

    int device, st=100;
    float hCr=0.6f, hF=0.8f;
    cudaDeviceProp prop;
    HANDLE_ERROR(cudaGetDevice(&device));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
//  int SN = prop.maxThreadsPerBlock; //512 threads per block
    //int SB = (hNP+(SN-1))/SN;


    //constants NP, D, N, Cr, F
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));

    //seeds
    curandState* devStates;
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
    setup_kernel <<< 1, hNP>>> (devStates, 50);

    //population
    float *population, *pop;
    float hpopulation[hNP*hD];
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));

    //mutation
    int *mutation, *mutation1;
    int *hmutation;
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));

    //stream
    cudaStream_t stream_i, stream_j;
    HANDLE_ERROR(cudaStreamCreate(&stream_i));
    HANDLE_ERROR(cudaStreamCreate(&stream_j));

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);


    while(st != 0)
    {
        /*** COPYING MUTATION INDICES***/
        HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
        HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));

        /**** CALLING KERNELS****/
        kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
        kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);

        st--;

        //HANDLE_ERROR(cudaStreamSynchronize(stream_i));
        //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
        //getBestScore(hpopulation);
        //cin.get();
    }

    HANDLE_ERROR(cudaStreamSynchronize(stream_i));
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
    getBestScore(hpopulation);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float time;
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;

    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaStreamDestroy(stream_i));
    HANDLE_ERROR(cudaStreamDestroy(stream_j));
    HANDLE_ERROR(cudaFree(population));
    HANDLE_ERROR(cudaFree(pop));
    HANDLE_ERROR(cudaFreeHost(hmutation));
    HANDLE_ERROR(cudaFree(mutation1));
    HANDLE_ERROR(cudaFree(devStates));

    system("pause");
    return 0;
}

cuda - CUDA DE 内核未启动

1 回答 1

Related

Reference