1

I am trying to accelerate the following bit of CUDA code by using multiple streams.

// Problem size: a 4096 x 4096 grid, stored as a flat array of N floats.
#define N (4096 * 4096)
// Thread-block tile dimensions: 16 x 16 = 256 threads per block.
#define blockDimX  16
#define blockDimY  16

// NOTE(review): these are *host* arrays, yet the kernels below are described
// as updating domain1/domain2 — presumably the real code uses __device__
// arrays or cudaMalloc'd buffers; confirm against the full source.
float domain1 [N];
float domain2 [N];

// Updates domain1 in place. Expected launch: a 2-D grid of
// blockDimX x blockDimY thread blocks covering a dimX x dimY domain
// (one thread per grid point).
// BUG fixed: __global__ functions must return void; the original omitted
// the return type entirely, which does not compile.
__global__ void updateDomain1_kernel(const int dimX, const int dimY) {
    // update mechanism here for domain1
    // ...
}

// Updates domain2 in place; independent of updateDomain1_kernel, so the two
// may run concurrently in separate streams. Same launch geometry as
// updateDomain1_kernel.
// BUG fixed: added the mandatory void return type on the __global__ function.
__global__ void updateDomain2_kernel(const int dimX, const int dimY) {
    // update mechanism here for domain2, which is nearly the same
    // ...
}

// Element-wise combine: writes domain1 + domain2 into domainOut.
// Must not be launched until both update kernels have completed.
// BUG fixed: added the mandatory void return type on the __global__ function.
__global__ void addDomainsTogether_kernel(float* domainOut,
                                          const int dimX,
                                          const int dimY)
{
    // add domain1 and domain2 together and fill domainOut
}

// Runs both update kernels, combines the domains, and copies the result back
// into the host buffer domainOut (N floats).
// NOTE(review): dimX, dimY, and domainOut_gpu are not declared in this
// snippet — presumably file-scope globals; confirm against the full source.
void updateDomains(float* domainOut) {
    // Ceil-divide so the grid covers domains whose extent is not an exact
    // multiple of the block tile size.
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);

    // Both launches go to the default stream, so they execute in issue order.
    updateDomain1_kernel<<<blocks, threads>>> (dimX, dimY);
    updateDomain2_kernel<<<blocks, threads>>> (dimX, dimY);
    // BUG fixed: the original launched with `block`, an undeclared name —
    // the dim3 above is called `blocks`.
    addDomainsTogether_kernel<<<blocks, threads>>> (domainOut_gpu, dimX, dimY);
    // Blocking copy on the default stream: waits for the preceding kernels
    // before reading domainOut_gpu.
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);
}

The precise implementation doesn't really matter; what's important is that updating the respective domains are two completely independent operations, after which both are used in the third kernel call. Hence I thought it a good idea to try to accelerate it by putting each update kernel in its own stream, which I want to run simultaneously. So I changed it to the following:

// Two-stream variant: the independent update kernels are issued to separate
// streams so the hardware MAY overlap them (it will only pay off if a single
// kernel does not already saturate the SMs).
// NOTE(review): dimX, dimY, and domainOut_gpu are not declared in this
// snippet — presumably file-scope globals; confirm against the full source.
void updateDomains(float* domainOut) {
    // Ceil-divide so partial edge tiles are covered.
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);

    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);

    // Independent work in independent streams — eligible for concurrent
    // execution on the device.
    updateDomain1_kernel<<<blocks, threads, 0, stream0>>> (dimX, dimY);
    updateDomain2_kernel<<<blocks, threads, 0, stream1>>> (dimX, dimY);
    // IMPROVED: wait only on the two streams we used instead of the
    // heavyweight cudaDeviceSynchronize, which stalls the whole device.
    cudaStreamSynchronize(stream0);
    cudaStreamSynchronize(stream1);

    // BUG fixed: the original launched with `block`, an undeclared name —
    // the dim3 above is called `blocks`.
    addDomainsTogether_kernel<<<blocks, threads>>> (domainOut_gpu, dimX, dimY);
    // Blocking copy on the default stream: waits for the combine kernel
    // before reading domainOut_gpu.
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
}

I presumed to find a difference in performance speed, but there is absolutely no noticeable difference. So thinking that maybe the compiler was being smart the first time by automatically scheduling the update calls at the same time, I assumed that the following should slow down the performance:

// Single-stream variant: all three launches go to the same stream, so they
// execute strictly in issue order — both updates finish before the combine.
// NOTE(review): dimX, dimY, and domainOut_gpu are not declared in this
// snippet — presumably file-scope globals; confirm against the full source.
void updateDomains(float* domainOut) {
    // Ceil-divide so partial edge tiles are covered.
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);

    cudaStream_t stream0;
    cudaStreamCreate(&stream0);

    updateDomain1_kernel<<<blocks, threads, 0, stream0>>> (dimX, dimY);
    updateDomain2_kernel<<<blocks, threads, 0, stream0>>> (dimX, dimY);

    // BUG fixed: the original wrote <<<block, threads0, stream0>>> — an
    // undeclared `block`, a fused `threads0`, and the stream passed in the
    // dynamic-shared-memory slot. The launch config is
    // <<<grid, block, sharedMemBytes, stream>>>.
    addDomainsTogether_kernel<<<blocks, threads, 0, stream0>>> (domainOut_gpu, dimX, dimY);
    // Explicitly drain stream0 before the copy; the blocking cudaMemcpy on
    // the legacy default stream would also synchronize, but this stays
    // correct under per-thread default-stream compilation too.
    cudaStreamSynchronize(stream0);
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaStreamDestroy(stream0);
}

However, again there is hardly any difference in performance speed. If anything, the last one seems fastest. Which makes me think there is something about CUDA streams I do not understand. Can someone enlighten me on how to accelerate this code?

4

1 回答 1

1

如果您还没有使用所有可用的内核,则增加的并行性只会增加您的计算吞吐量。如果您已经拥有足够的并行度,那么除了增加同步开销之外,它对您无济于事。

于 2013-03-04T19:24:05.600 回答