cuda - 我的 CUDA 内核中的 printf() 不会产生任何输出

Question

我在我的 CUDA 程序中添加了一些 printf() 语句：

// Forward declaration of the kernel that is defined in Kernel.cu.
// NOTE(review): the CUDA C Programming Guide states that __global__ and
// __device__ cannot be used together — confirm against the toolkit in use.
__device__ __global__ void Kernel(float *, float * ,int );
// Host-side wrapper that launches Kernel (body abridged in the original post).
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
    //Kernel call
    printf("calling kernel\n");
    // No error check or synchronizing call follows this launch in the visible
    // code, so a failed launch would go unnoticed here.
    Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
    printf("kernel called\n");
  ....
}

// Program entry point (abridged in the original post): prints a marker line
// before and after the call to DeviceFunc.
int main(int argc , char **argv)
{   ....
    printf("beforeDeviceFunc\n\n");
    DeviceFunc(a_h , numvar , b_h); //Showing the data
    printf("after DeviceFunc\n\n");
    ....
}

同样在 Kernel.cu 中，我写道：

#include<cuda.h>
#include <stdio.h>
// Kernel (body abridged in the original post): each thread copies one element
// of a_d into a 16x16 shared-memory tile and prints its thread indices.
// NOTE(review): the CUDA C Programming Guide states that __global__ and
// __device__ cannot be used together — confirm against the toolkit in use.
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
    int idx = threadIdx.x ;
    int idy = threadIdx.y ;
    //Allocating memory in the shared memory of the device
    // NOTE(review): the indexing below assumes blockDim.x and blockDim.y are
    // both at most 16 — confirm at the launch site.
    __shared__ float temp[16][16];

    //Copying the data to the shared memory
    // Rows of a_d are addressed with a stride of (size + 1) elements.
    temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
    // The format string has no trailing newline.
    printf("idx=%d, idy=%d, size=%d", idx, idy, size);
    ....
}

然后我使用 -arch=sm_20 像这样编译：

nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main

现在，当我运行程序时，我看到：

beforeDeviceFunc

calling kernel
kernel called
after DeviceFunc

所以内核内部的 printf() 没有打印出任何内容。我该如何解决？

score 14 · Accepted Answer

printf()仅当内核成功完成时才会显示输出，因此请检查所有 CUDA 函数调用的返回码并确保没有报告错误。

此外，printf() 的输出仅在程序中的某些时间点才会显示。编程指南的附录 B.32.2 将这些时间点列为：

通过 <<<>>> 或 cuLaunchKernel() 启动内核（在启动开始时；如果 CUDA_LAUNCH_BLOCKING 环境变量设置为 1，则在启动结束时也会显示），
通过 cudaDeviceSynchronize()、cuCtxSynchronize()、cudaStreamSynchronize()、cuStreamSynchronize()、cudaEventSynchronize() 或 cuEventSynchronize() 进行同步，
通过 cudaMemcpy*() 或 cuMemcpy*() 的任何阻塞版本进行内存复制，
通过 cuModuleLoad() 或 cuModuleUnload() 加载/卸载模块，
通过 cudaDeviceReset() 或 cuCtxDestroy() 销毁上下文，
在执行由 cudaStreamAddCallback() 或 cuStreamAddCallback() 添加的流回调之前。

要检查这是否是您的问题，请在内核调用之后添加以下代码：

{
    // Wait for the kernel to finish. cudaDeviceSynchronize() is also one of
    // the points at which device-side printf() output is displayed.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (cudaSuccess != syncStatus)
    {
        const char *reason = cudaGetErrorString(syncStatus);
        printf("kernel launch failed with error \"%s\".\n", reason);
    }
}

然后，您应该会看到内核的输出或错误消息。

更方便的是，如果您在 cuda-memcheck 下运行可执行文件，它将自动为您检查所有返回码。虽然您仍然应该始终检查错误，但这在解决具体问题时会派上用场。

score 0 · Accepted Answer

我刚才遇到了同样的错误，将块大小减小到 512 有所帮助。根据文档，最大块大小可以是 512 或 1024。

我编写了一个简单的测试，表明我的 GTX 1070 的最大块大小为 1024。 UPD：您可以使用 cudaError_t cudaPeekAtLastError() 检查您的内核是否真正被启动：如果内核已成功启动，它会返回 cudaSuccess；只有在此之后才值得调用 cudaError_t cudaDeviceSynchronize()。

Testing block size of 1023

Testing block size of 1024

Testing block size of 1025

CUDA error: invalid configuration argument

Block maximum size is 1024

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <iostream>

// Kernel: each thread stores the value 1 into the element of `t` selected by
// its own threadIdx.x. No bounds check is performed inside the kernel.
__global__ void set1(int* t)
{
    const unsigned int slot = threadIdx.x;
    t[slot] = 1;
}

// Reports a CUDA status code.
// Returns false when `error` is cudaSuccess; otherwise writes the matching
// error string to stderr and returns true.
inline bool failed(cudaError_t error)
{
    const bool isError = (error != cudaSuccess);
    if (isError)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
    }
    return isError;
}

// Probes the largest 1-D block size that can be launched on the current device.
//
// For blockSize = 1 .. 4095 the program allocates a managed buffer of
// blockSize ints, zeroes it, launches set1 with one block of blockSize
// threads, and verifies that every element became 1. It stops at the first
// size for which allocation, launch, execution or verification fails and
// reports the previous size as the maximum.
//
// Returns 0 on success, 1 if not even a block size of 1 worked.
int main()
{
    int blockSize;
    for (blockSize = 1; blockSize < 1 << 12; blockSize++)
    {
        printf("Testing block size of %d\n", blockSize);

        // Start from nullptr so a failed allocation never leaves an
        // indeterminate pointer behind.
        int* t = nullptr;
        if (failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
            break; // nothing was allocated, so there is nothing to free

        // Clear every element (not just t[0]) so the check below can only
        // pass if the kernel really wrote each slot.
        for (int i = 0; i < blockSize; i++)
            t[i] = 0;

        set1 <<<1, blockSize>>> (t);

        // Launch status first; only wait for the kernel if the launch itself
        // succeeded (short-circuit evaluation).
        bool ok = !failed(cudaPeekAtLastError())
               && !failed(cudaDeviceSynchronize());

        if (ok)
        {
            for (int i = 0; i < blockSize; i++)
                if (1 != t[i])
                {
                    printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
                    ok = false;
                    break;
                }
        }

        // Single cleanup path: the buffer is released whether or not this
        // block size worked.
        failed(cudaFree(t));
        if (!ok)
            break;
    }

    // blockSize is the first size that failed (or the loop bound); the last
    // size that worked is one less.
    blockSize--;
    if(blockSize <= 0)
    {
        printf("CUDA error: block size cannot be 0\n");
        return 1;
    }
    printf("Block maximum size is %d", blockSize);
    return  0;
}

PS 请注意，选择块大小时唯一需要考虑的是线程束（warp）粒度，目前为 32，所以如果 0 == yourBlockSize % 32，线程束的使用效率就很高。使块大于 32 的唯一原因是代码需要同步，因为同步仅在单个块中的线程之间可用，这使得开发人员使用单个大块而不是许多小块。因此，使用更多数量的小块运行可能比使用更少数量的大块运行效率更高。

cuda - 我的 CUDA 内核中的 printf() 不会产生任何输出

2 回答 2

Related

Reference