I am trying to accelerate the following bit of CUDA code by using multiple streams.
#define N (4096 * 4096)   // total elements per domain (4096 x 4096 grid)
#define blockDimX 16      // threads per block, x dimension
#define blockDimY 16      // threads per block, y dimension
// NOTE(review): these are host-side globals; the kernels below would need
// device allocations (cudaMalloc / __device__ arrays) to touch this data — confirm.
float domain1 [N];
float domain2 [N];
// Updates domain1 on the device. Expects a 2D launch covering dimX x dimY.
// Fixed: a __global__ kernel must declare a `void` return type — the
// original omitted it, which does not compile.
__global__ void updateDomain1_kernel(const int dimX, const int dimY) {
// update mechanism here for domain1
// ...
}
// Updates domain2 on the device. Expects a 2D launch covering dimX x dimY.
// Fixed: added the missing `void` return type required for __global__ kernels.
__global__ void updateDomain2_kernel(const int dimX, const int dimY) {
// update mechanism here for domain2, which is nearly the same
// ...
}
// Element-wise combines domain1 and domain2 into domainOut.
// Expects a 2D launch covering dimX x dimY; domainOut must be a device pointer.
// Fixed: added the missing `void` return type required for __global__ kernels.
__global__ void addDomainsTogether_kernel(float* domainOut,
                                          const int dimX,
                                          const int dimY)
{
// add domain1 and domain2 together and fill domainOut
}
// Runs both update kernels and the combine kernel on the default stream,
// then copies the result back to the host.
// NOTE(review): assumes dimX, dimY and domainOut_gpu are declared elsewhere
// in the file (not visible here) — confirm.
void updateDomains(float* domainOut) {
    // Ceil-divide so edge tiles that don't fill a full block are still covered.
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);
    updateDomain1_kernel<<<blocks, threads>>>(dimX, dimY);
    updateDomain2_kernel<<<blocks, threads>>>(dimX, dimY);
    // Fixed: the original launched with undeclared `block`; the dim3 above is `blocks`.
    addDomainsTogether_kernel<<<blocks, threads>>>(domainOut_gpu, dimX, dimY);
    // Blocking copy on the default stream; it waits for the preceding
    // default-stream kernels before reading domainOut_gpu.
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);
}
The precise implementation doesn't really matter; what's important is that the updates of the two domains are completely independent operations, after which both results are used in the third kernel call. Hence I thought it a good idea to try to accelerate the code by putting each update kernel in its own stream, so that the two can run simultaneously. So I changed it to the following:
// Runs the two independent update kernels in separate streams so they may
// overlap, waits for both, then combines and copies the result back.
// NOTE(review): assumes dimX, dimY and domainOut_gpu are declared elsewhere
// in the file (not visible here) — confirm.
void updateDomains(float* domainOut) {
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);
    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);
    updateDomain1_kernel<<<blocks, threads, 0, stream0>>>(dimX, dimY);
    updateDomain2_kernel<<<blocks, threads, 0, stream1>>>(dimX, dimY);
    // Wait only for the two streams we launched into, instead of a
    // device-wide cudaDeviceSynchronize() that stalls unrelated work.
    cudaStreamSynchronize(stream0);
    cudaStreamSynchronize(stream1);
    // Fixed: the original launched with undeclared `block`; the dim3 above is `blocks`.
    addDomainsTogether_kernel<<<blocks, threads>>>(domainOut_gpu, dimX, dimY);
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
}
I expected to find a difference in performance, but there is absolutely no noticeable difference. Thinking that maybe the compiler was being smart the first time and automatically scheduling the two update calls concurrently, I assumed that the following version should slow things down:
// Runs all three kernels back-to-back in a single non-default stream,
// then copies the result back to the host.
// NOTE(review): assumes dimX, dimY and domainOut_gpu are declared elsewhere
// in the file (not visible here) — confirm.
void updateDomains(float* domainOut) {
    dim3 blocks((dimX + blockDimX - 1) / blockDimX, (dimY + blockDimY - 1) / blockDimY);
    dim3 threads(blockDimX, blockDimY);
    cudaStream_t stream0;
    cudaStreamCreate(&stream0);
    updateDomain1_kernel<<<blocks, threads, 0, stream0>>>(dimX, dimY);
    updateDomain2_kernel<<<blocks, threads, 0, stream0>>>(dimX, dimY);
    // Fixed: the original `<<<block, threads0, stream0>>>` misnamed the grid
    // variable and dropped the shared-memory argument/comma; the correct
    // launch configuration is <<<blocks, threads, 0, stream0>>>.
    addDomainsTogether_kernel<<<blocks, threads, 0, stream0>>>(domainOut_gpu, dimX, dimY);
    // Explicitly drain stream0 before the default-stream copy reads the
    // result, so correctness does not rely on legacy default-stream
    // blocking semantics (broken under --default-stream per-thread).
    cudaStreamSynchronize(stream0);
    cudaMemcpy(domainOut, domainOut_gpu, N * sizeof(float), cudaMemcpyDeviceToHost);
    cudaStreamDestroy(stream0);
}
However, again there is hardly any difference in performance. If anything, the last version seems fastest. This makes me think there is something about CUDA streams I do not understand. Can someone enlighten me on how to accelerate this code?