cuda - CUDA 中的动态并行性不起作用

Question

我写了一个简单的代码来理解动态并行。从打印的值中，我看到子内核已正确执行，但是当我回到父内核时，我看到使用了错误的值来代替在子内核中正确更新的临时数组。当我尝试更新“d_cin 数组”时，它给了我错误的值。这些是正在使用的编译标志：

nvcc -m64 -dc  -gencode arch=compute_35,code=sm_35  -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35  -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt

有人能帮我吗？这是代码。

#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"

__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
    int index=threadIdx.x+blockIdx.x*blockDim.x;
    d_temp[parentIndex*3+index]=I[parentIndex];

}

__global__ void kernel(double *d_I,double *d_temp,double *d_cin){

    int index=threadIdx.x+blockIdx.x*blockDim.x;
    int i;
    double res=0.0;
        if(index<30){

    cudaStream_t s;
        cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
    dim3 dimBlock(3,1,1);
    dim3 dimGrid(1,1,1);
    innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
        __syncthreads();

    if(index==0){
        for(i=0;i<90;i++)
            cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
    }   
    for (i=0;i<3;i++){
            res=res+d_temp[index*3+i];
    }
        __syncthreads();
    d_cin[index]=res;
        cudaStreamDestroy(s);
    }
}

int main(int argc,char **argv){

    double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
    double *d_I;
    double *d_temp;
    double *d_cin;
    double cout[30];

    cudaMalloc(&d_I,30*sizeof(double));
    cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);

    cudaMalloc(&d_temp,3*30*sizeof(double));

    cudaMalloc(&d_cin,30*sizeof(double));

    dim3 dimBlock(8,1,1);
    dim3 dimGrid(4,1,1);
    /*LAUNCH THE KERNEL*/
    printf("Before the kernel\n");
    cudaPrintfInit();
    kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
    //cudaThreadSynchronize();
    cudaPrintfDisplay(stdout,true);
    cudaPrintfEnd();
    printf("After the kernel\n");
    cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);

    int i;
    for(i=0;i<30;i++)
       printf("%f\n",cout[i]);

}

score 2 · Accepted Answer

每当您遇到 CUDA 代码问题时，我建议您进行适当的 cuda 错误检查（您也可以在内核代码中使用这种方法来实现动态并行）。您也可以尝试使用cuda-memcheck. 最后，cuPrintf没有必要。任何 cc 3.5 GPU 都printf直接从内核代码支持。

您的代码中的问题是内核启动（包括子内核启动）是异步的，这意味着在启动的内核完成之前，控制权会立即返回给调用线程。

您可能认为这__syncthreads()是强制子内核完成的适当障碍。它不是。在这种情况下使用的正确屏障与在主机上使用的屏障相同： cudaDeviceSynchronize()

当我在您cudaDeviceSynchronize()的第一个之前添加 a 时，我会得到我认为您期望的结果。__syncthreads()kernel

另请注意，主内核中的流创建没有做任何有用的事情。您没有在任何地方明确使用该流。

这是您的代码的略微简化版本，具有上述修复和测试运行：

$ cat t68.cu
#include <stdio.h>

__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
    int index=threadIdx.x+blockIdx.x*blockDim.x;
    d_temp[parentIndex*3+index]=I[parentIndex];
}

__global__ void kernel(double *d_I,double *d_temp,double *d_cin){

    int index=threadIdx.x+blockIdx.x*blockDim.x;
    int i;
    double res=0.0;
    if(index<30){
      dim3 dimBlock(3,1,1);
      dim3 dimGrid(1,1,1);
      innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
      cudaDeviceSynchronize();
      __syncthreads();
      if(index==0){
        for(i=0;i<90;i++)
            printf("temp[%d]: %f\n",i,d_temp[i]);
      }
      for (i=0;i<3;i++){
            res=res+d_temp[index*3+i];
      }
      __syncthreads();
      d_cin[index]=res;
    }
}

int main(int argc,char **argv){

    double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
    double *d_I;
    double *d_temp;
    double *d_cin;
    double cout[30];

    cudaMalloc(&d_I,30*sizeof(double));
    cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);

    cudaMalloc(&d_temp,3*30*sizeof(double));

    cudaMalloc(&d_cin,30*sizeof(double));

    dim3 dimBlock(8,1,1);
    dim3 dimGrid(4,1,1);
    /*LAUNCH THE KERNEL*/
    printf("Before the kernel\n");
    kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
    //cudaThreadSynchronize();
    cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
    printf("After the kernel\n");

    int i;
    for(i=0;i<30;i++)
       printf("%f\n",cout[i]);

}
$ nvcc -rdc=true -arch=sm_35 -o t68 t68.cu -lcudadevrt
$ ./t68
Before the kernel
temp[0]: 1.000000
temp[1]: 1.000000
temp[2]: 1.000000
temp[3]: 2.000000
temp[4]: 2.000000
temp[5]: 2.000000
temp[6]: 3.000000
temp[7]: 3.000000
temp[8]: 3.000000
temp[9]: 4.000000
temp[10]: 4.000000
temp[11]: 4.000000
temp[12]: 5.000000
temp[13]: 5.000000
temp[14]: 5.000000
temp[15]: 6.000000
temp[16]: 6.000000
temp[17]: 6.000000
temp[18]: 7.000000
temp[19]: 7.000000
temp[20]: 7.000000
temp[21]: 8.000000
temp[22]: 8.000000
temp[23]: 8.000000
temp[24]: 9.000000
temp[25]: 9.000000
temp[26]: 9.000000
temp[27]: 10.000000
temp[28]: 10.000000
temp[29]: 10.000000
temp[30]: 11.000000
temp[31]: 11.000000
temp[32]: 11.000000
temp[33]: 12.000000
temp[34]: 12.000000
temp[35]: 12.000000
temp[36]: 13.000000
temp[37]: 13.000000
temp[38]: 13.000000
temp[39]: 14.000000
temp[40]: 14.000000
temp[41]: 14.000000
temp[42]: 15.000000
temp[43]: 15.000000
temp[44]: 15.000000
temp[45]: 16.000000
temp[46]: 16.000000
temp[47]: 16.000000
temp[48]: 17.000000
temp[49]: 17.000000
temp[50]: 17.000000
temp[51]: 18.000000
temp[52]: 18.000000
temp[53]: 18.000000
temp[54]: 19.000000
temp[55]: 19.000000
temp[56]: 19.000000
temp[57]: 20.000000
temp[58]: 20.000000
temp[59]: 20.000000
temp[60]: 21.000000
temp[61]: 21.000000
temp[62]: 21.000000
temp[63]: 22.000000
temp[64]: 22.000000
temp[65]: 22.000000
temp[66]: 23.000000
temp[67]: 23.000000
temp[68]: 23.000000
temp[69]: 24.000000
temp[70]: 24.000000
temp[71]: 24.000000
temp[72]: 25.000000
temp[73]: 25.000000
temp[74]: 25.000000
temp[75]: 26.000000
temp[76]: 26.000000
temp[77]: 26.000000
temp[78]: 27.000000
temp[79]: 27.000000
temp[80]: 27.000000
temp[81]: 28.000000
temp[82]: 28.000000
temp[83]: 28.000000
temp[84]: 29.000000
temp[85]: 29.000000
temp[86]: 29.000000
temp[87]: 30.000000
temp[88]: 30.000000
temp[89]: 30.000000
After the kernel
3.000000
6.000000
9.000000
12.000000
15.000000
18.000000
21.000000
24.000000
27.000000
30.000000
33.000000
36.000000
39.000000
42.000000
45.000000
48.000000
51.000000
54.000000
57.000000
60.000000
63.000000
66.000000
69.000000
72.000000
75.000000
78.000000
81.000000
84.000000
87.000000
90.000000
$

cuda - CUDA 中的动态并行性不起作用

1 回答 1

Related

Reference