random - 使用 Curand 和 CUDA 内核同时初始化许多具有随机数的数组

Question

我正在尝试在 GPU 上用随机生成的数字同时初始化这些并行数组，每个数组有 100 个元素。但是，我的例程并没有产生不同的随机数：在 Visual Studio 中调试代码时，我看到数组中的所有元素都是同一个数字。此代码的目的是优化 CImg FilledTriangles 例程，使其尽可能使用 GPU。

我做错了什么，我该如何解决？这是我的代码：

/*
 * Seeds one cuRAND generator state per launched thread.
 *
 * state: device array of generator states, indexed by the flat global thread
 *        index. There is no bounds check, so it must hold at least
 *        gridDim.x * blockDim.x entries.
 * seed:  seed shared by every thread; the thread's global index is passed as
 *        the second argument of curand_init (the sequence number) and 0 as
 *        the third (the offset).
 */
__global__ void initCurand(curandState* state, unsigned long seed)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Each thread touches only its own state slot, so no barrier is needed.
    curand_init(seed, idx, 0, &state[idx]);
}

/*
 * CUDA kernel that fills element idx of every property array with uniform
 * random values, one element per thread (the host launches 100 threads).
 *
 * posx, posy, rayon, veloc, opacity, angle: device output arrays indexed by
 *        the flat global thread index.
 * color: indexed as color[idx][0..2], i.e. it is dereferenced as a device
 *        table of per-element row pointers.
 *        NOTE(review): the host code allocates this buffer with
 *        cudaMallocPitch, which returns one flat pitched buffer rather than a
 *        table of row pointers -- confirm how this argument is really built.
 * height, width: scale factors applied to the posy / posx random values.
 * state: per-thread generator states; read at state[idx] and written back
 *        once the draws are done.
 *
 * There is no bounds check: every array must hold at least
 * gridDim.x * blockDim.x elements.
 */

__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, float* opacity
                                ,float * angle, unsigned char** color, int height, int width, curandState* state){

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Draw from a local copy of this thread's generator state. Threads share
    // no data here, so no block barrier is required.
    curandState localState = state[idx];

    posx[idx] = curand_uniform(&localState) * width;
    posy[idx] = curand_uniform(&localState) * height;
    rayon[idx] = 10 + curand_uniform(&localState) * 50;
    angle[idx] = curand_uniform(&localState) * 360;
    veloc[idx] = curand_uniform(&localState) * 20 - 10;
    color[idx][0] = (unsigned char)(curand_uniform(&localState) * 255);
    color[idx][1] = (unsigned char)(curand_uniform(&localState) * 255);
    color[idx][2] = (unsigned char)(curand_uniform(&localState) * 255);
    // Float literals keep the arithmetic in single precision.
    opacity[idx] = 0.3f + 1.5f * curand_uniform(&localState);

    // Persist the advanced state so a later launch does not replay the same
    // random values.
    state[idx] = localState;
}

这是准备和调用这些内核的主机代码：我试图在网格的一个块上创建 100 个线程（针对每个元素）。

 // Launch configuration: a single block of 100 threads, one thread per element.
      dim3 dimBlock(100);
      dim3 dimGrid(1);

      // The execution configuration is <<<grid, block>>>: grid dimensions
      // come first, block dimensions second.
      initCurand<<<dimGrid, dimBlock>>>(devState, unsigned(time(nullptr)));
      // A kernel launch returns nothing; launch-configuration errors are
      // reported through cudaGetLastError().
      err = cudaGetLastError();
      errCheck(err);
      // Synchronize the device and the host; this also surfaces any fault
      // raised while initCurand was executing.
      err = cudaDeviceSynchronize();
      errCheck(err);
      initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState);
      err = cudaGetLastError();
      errCheck(err);

准备工作（变量声明与设备内存分配）：

  // Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
    float posx[100], posy[100], rayon[100], angle[100], veloc[100], opacity[100];
    // Device-side counterparts of the host arrays above.
    float* d_posx;
    float* d_posy;
    float* d_rayon;
    float* d_angle;
    float* d_veloc;
    float* d_opacity;
    //unsigned char d_color[100][3];
    // NOTE(review): cudaMallocPitch below fills this with the address of one
    // flat pitched buffer, not a table of row pointers -- confirm that the
    // unsigned char** type is really what the consumers expect.
    unsigned char** d_color;
    curandState* devState;
    cudaError_t err;

    // Allocate the device arrays. Every result is checked individually so a
    // failed allocation is not masked by a later call that succeeds.
    err = cudaMalloc((void**)&d_posx, 100 * sizeof(float));
    errCheck(err);
    err = cudaMalloc((void**)&d_posy, 100 * sizeof(float));
    errCheck(err);
    err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float));
    errCheck(err);
    err = cudaMalloc((void**)&d_angle, 100 * sizeof(float));
    errCheck(err);
    err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float));
    errCheck(err);
    err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float));
    errCheck(err);
    // One generator state per thread.
    err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
    errCheck(err);
    size_t pitch;
    // Pitched allocation for the colours: 100 rows, 3 bytes wide. `pitch`
    // receives the byte distance between consecutive rows.
    err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100);
    errCheck(err);

得到结果：

// Copy the populated arrays back to the host for use; every copy is checked.
     err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost);
     errCheck(err);
     // 2D copy of the pitched colour buffer: 100 rows, 3 bytes per row.
     // cudaMemcpy2D takes (dst, dst pitch, src, src pitch, width in bytes,
     // height in rows, kind). Source rows are `pitch` bytes apart, as returned
     // by cudaMallocPitch.
     // NOTE(review): assumes the host `color` array is tightly packed as
     // unsigned char[100][3] -- confirm against its declaration.
     err = cudaMemcpy2D(color, 3 * sizeof(unsigned char), d_color, pitch,
                        3 * sizeof(unsigned char), 100, cudaMemcpyDeviceToHost);
     errCheck(err);

score 1 · Accepted Answer

首先，您必须修改下面这一行：

// Before (buggy): the buffer is sized as 100 floats although devState holds
// curandState objects.
err = cudaMalloc((void**)&devState, 100*sizeof(float));

改成：

// After: the buffer is sized for 100 curandState objects.
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));

如果你通过 cuda-memcheck 运行你的代码，你就会发现这一点。因此，您的 initCurand 内核有很多越界访问。

您还应该对所有 CUDA API 调用和所有内核启动进行错误检查。我认为，由于对 color[][] 数组的操作有误，您的第二次内核调用会失败。

通常，当我们使用 cudaMallocPitch 创建数组时，需要借助 pitch 参数来访问它。C 语言的双下标数组写法在这里行不通，因为 C 本身并不知道数组的实际行宽。

我能够通过进行以下更改来修复它：

/*
 * Fills element idx of every property array with uniform random values, one
 * element per thread.
 *
 * posx, posy, rayon, veloc, opacity, angle: device output arrays indexed by
 *        the flat global thread index.
 * color: flat pitched byte buffer; row idx starts at byte offset idx * pitch
 *        and its first three bytes are written.
 * height, width: scale factors applied to the posy / posx random values.
 * state: per-thread generator states; read at state[idx] and written back
 *        once the draws are done.
 * pitch: distance in bytes between consecutive rows of `color`.
 *
 * There is no bounds check: every array must hold at least
 * gridDim.x * blockDim.x elements (rows, for `color`).
 */
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, float* opacity,float * angle, unsigned char* color, int height, int width, curandState* state, size_t pitch){

    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Draw from a local copy of this thread's generator state. Threads share
    // no data here, so no block barrier is required.
    curandState localState = state[idx];

    posx[idx] = curand_uniform(&localState) * width;
    posy[idx] = curand_uniform(&localState) * height;
    rayon[idx] = 10 + curand_uniform(&localState) * 50;
    angle[idx] = curand_uniform(&localState) * 360;
    veloc[idx] = curand_uniform(&localState) * 20 - 10;

    // color is a byte pointer, so the byte pitch can be used directly as the
    // row stride.
    unsigned char* row = color + idx * pitch;
    row[0] = (unsigned char)(curand_uniform(&localState) * 255);
    row[1] = (unsigned char)(curand_uniform(&localState) * 255);
    row[2] = (unsigned char)(curand_uniform(&localState) * 255);

    // Float literals keep the arithmetic in single precision.
    opacity[idx] = 0.3f + 1.5f * curand_uniform(&localState);

    // Persist the advanced state so a later launch does not replay the same
    // random values.
    state[idx] = localState;
}

和

 // The execution configuration is <<<grid, block>>>: dimGrid (1 block) comes
 // first, dimBlock (100 threads) second.
 initializeArrays<<<dimGrid, dimBlock>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,d_color, img0.height(), img0.width(), devState, pitch);

和

// Single-level byte pointer, matching the `unsigned char* color` parameter of
// the revised kernel, which addresses row idx at color + idx * pitch.
unsigned char* d_color;

通过这些更改，我能够消除我发现的错误，并且代码会吐出各种随机值。我还没有检查所有的值，但这应该让你开始。

random - 使用 Curand 和 CUDA 内核同时初始化许多具有随机数的数组

1 回答 1

Related

Reference