cuda - cuda 内核不能同时执行

Question

我正在尝试探索具有 2.0 功能的 Nvidia Quadro 4000 的并发内核执行属性。

我使用 2 个不同的流，它们的运行方式如下：

复制 H2D 两个不同的固定内存块
运行内核
Copyt D2H 两个不同的块到固定内存。

两个流的内核完全相同，每个都有 190 毫秒的执行时间。

在 Visual profiler（5.0 版）中，我希望两个内核同时开始执行，但它们仅重叠 20 毫秒。这是代码示例：

enter code here

//initiate the streams
        cudaStream_t stream0,stream1;
        CHK_ERR(cudaStreamCreate(&stream0));
        CHK_ERR(cudaStreamCreate(&stream1));
        //allocate the memory on the GPU for stream0
        CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        //allocate the memory on the GPU for stream1
        CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

        //allocate page-locked memory for stream0
        CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

        //allocate page-locked memory for stream1
        CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


        memcpy(host01,in1,width*height*sizeof(char));
        memcpy(host02,in2,width*height*sizeof(char));

        memcpy(host11,in1,width*height*sizeof(char));
        memcpy(host12,in2,width*height*sizeof(char));



        cudaEvent_t start, stop;
        float time;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
        dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

        cudaEventRecord(start,0);
        // --------------------
        // Copy images to device
        // --------------------
        //enqueue copies of def stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
        //enqueue copies of ref stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        //CALLING KERNEL
        //enqueue kernel in stream0 and stream1
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");


        //Copy result back
        CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));

        CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));


        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        cudaEventRecord( stop, 0 );
        cudaEventSynchronize( stop );
        cudaEventElapsedTime( &time, start, stop );
        printf("Elapsed time=%f ms\n",time);

        memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
        memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));


        cudaEventDestroy( start ); 
        cudaEventDestroy( stop );
        CHK_ERR(cudaStreamDestroy(stream0));
        CHK_ERR(cudaStreamDestroy(stream1));

        CHK_ERR(cudaDeviceReset());


    }

score 3 · Accepted Answer

计算能力 2.x-3.0

计算能力 2.x-3.0 设备有一个硬件工作队列。CUDA 驱动程序将命令推送到工作队列中。GPU 主机读取命令并将工作分派给复制引擎或 CUDA 工作分配器 (CWD)。CUDA 驱动程序将同步命令插入到硬件工作队列中，以保证同一流上的工作不能同时运行。当主机点击同步命令时，它将停止，直到相关工作完成。

当网格太小而无法填满整个 GPU 或网格具有尾部效应（线程块的子集执行时间比其他线程块长得多）时，并发内核执行可提高 GPU 利用率。

案例 1：一个流上的背靠背内核

如果应用程序在同一个流上背靠背启动两个内核，CUDA 驱动程序插入的同步命令将不会将第二个内核分派给 CWD，直到第一个内核完成。

案例 2：背靠背内核在两个流上启动

如果应用程序在不同的流上启动两个内核，主机将读取命令并将命令分派给 CWD。CWD 将光栅化第一个网格（顺序取决于架构）并将线程块分派给 SM。只有当第一个网格中的所有线程块都已被分派时，CWD 才会从第二个网格中分派线程块。

计算能力 3.5

计算能力 3.5 引入了几个新功能来提高 GPU 利用率。其中包括： - HyperQ 支持多个独立的硬件工作队列。- 动态并行允许设备代码启动新工作。- CWD 容量增加到 32 个网格。

资源

cuda - cuda 内核不能同时执行

1 回答 1

Related

Reference