0

出于某种原因,我在特定内核中设置的断点被完全忽略了......我已经检查了错误状态cudaGetLastError(),它告诉我一切运行正常,所以我很确定这应该意味着内核已经执行。放置printf语句也不会产生额外的信息,因为没有打印任何内容。即使在进入调试模式的内核中,调用printf也没有效果。这里有什么问题?!

我们在 Tesla M2075(驱动程序版本 295.41)上运行 Cuda 4.2。调试时输出:

(cuda-gdb) break cudaCalcBeamIntersect
Breakpoint 1 at 0x401cfb: file cudacalcbeamintersect.cu, line 109.
(cuda-gdb) r
Starting program: /home/heit/cuda/vfind/vfind singleevent.txt 1 1 1 
[Thread debugging using libthread_db enabled]
[New Thread 0x7ffff5dd5700 (LWP 20241)]
[Context Create of context 0x634220 on Device 0]
[Launch of CUDA Kernel 0 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 2 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 3 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 4 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 5 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
[Launch of CUDA Kernel 7 (cudaCalcBeamIntersect<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
Elapsed time: 0.876842 seconds.
[Thread 0x7ffff5dd5700 (LWP 20241) exited]
[Termination of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]

Program exited normally.

“无错误”打印通过调用在内核外部打印cout << cudaGetErrorString(cudaGetLastError()) << '\n';,并指示两者cudaInitializeGlobals()(可以在 cuda-gdb 中单步执行)并且cudaCalcBeamIntersect()没有问题地执行。然而,后者不能被调试。

有问题的内核仍然是一个初步的内核,并计算一些值以存储在(静态)全局内存中。对这些值没有做任何其他事情,所以编译器会完全优化这个调用吗?如果是,为什么??!!以及如何防止这种行为?(-O0 无效)

干杯!

编辑 - 代码:

** 调用内核的代码 **

    uint const nEvents = events.size();     // total number of events

    /* Not important ... */

// Allocate memory to hold the events
    Track *dev_events;                      
    cudaMalloc(&dev_events, linearEvents.size() * sizeof(Track));

// Copy all events to the GPU
    cudaMemcpy(dev_events, &linearEvents[0], linearEvents.size() * sizeof(Track), cudaMemcpyHostToDevice);

// Initialize the global data, like the histogram and the array of z-values
    cudaInitializeGlobals <<< tpb, bpg >>> ();
    cout << cudaGetErrorString(cudaGetLastError()) << '\n';

    cout << "Processing " << nEvents << " event(s)\n";
    uint linearIdx = 0;
    for (uint event = 0; event != nEvents; ++event)
    {
        uint nTracks = events[event].size();

        if (nTracks > MAX_NUMBER_OF_TRACKS)
        {
            cout << "Number of tracks in event " << event << " exceeds maximum number of tracks.\n";
            exit(1);
        }

        cudaCalcBeamIntersect <<< tpb, bpg >>> (dev_events + linearIdx, nTracks, bipThresh, binWidth);
        cout << cudaGetErrorString(cudaGetLastError()) << '\n';

    // Update linear index
        linearIdx += nTracks;
    }

cudacalcbeamintersect.cu

#include "vfind.cuh"

__device__ float    dev_zMin;
__device__ float    dev_zMax;
__device__ float    dev_zValues[MAX_NUMBER_OF_TRACKS];
__device__ uint     dev_histogram[MAX_NUMBER_OF_BINS];

__constant__ Track dev_beam = 
{
    {0, 0, 1},
    {0, 0, 0}
};

__global__ void cudaInitializeGlobals()
{
    uint const tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint const nThreads = blockDim.x * gridDim.x;

    if (tid == 0)
    {
        dev_zMin = 1e6;
        dev_zMax = -1e6;
    }

    uint idx = tid;
    while (idx < MAX_NUMBER_OF_BINS || idx < MAX_NUMBER_OF_TRACKS)          
    {
        if (idx < MAX_NUMBER_OF_BINS)
            dev_histogram[idx] = 0;

        if (idx < MAX_NUMBER_OF_TRACKS)
            dev_zValues[idx] = 0;

        idx += nThreads;
    }
}

__device__ float dot(float const v1[3], float const v2[3])
{
    // Stuff
}

__device__ float distance(Track const &t1, Track const &t2)
{
    // Even more boring unimportant stuff
}

__device__ Vertex vertex(Track const &t1, Track const &t2)
{
    // Yet even more boring unimportant stuff
}

__global__ void cudaCalcBeamIntersect(Track const *tracks, uint nTracks, float bipTresh, float binWidth)
{
    uint const tid = threadIdx.x + blockIdx.x * blockDim.x;
    uint const nThreads = blockDim.x * gridDim.x;

    uint idx = tid;
    while (idx < nTracks)
    {
        float dist = distance(tracks[idx], dev_beam);
        if (dist < bipTresh)
        {
            float z = vertex(tracks[idx], dev_beam).z;

            if (z < dev_zMin)
                atomicExch(&dev_zMin, z);

            if (z > dev_zMax)
                atomicExch(&dev_zMax, z);

            dev_zValues[idx] = z;
        }

        idx += nThreads;
    }

    __syncthreads();

    // To be continued here
}
4

1 回答 1

1

@JorenHeit 您的内核cudaCalcBeamIntersect具有全局内存副作用,不应进行优化。根据发布的 cuda-gdb 输出,看起来启动工作的主机线程没有等待工作完成(通过cudaDeviceSynchronize()调用或通过cudaMemcpy从设备到主机)。结果,主机线程cudaCalcBeamIntersect kernel在 GPU 上执行之前就退出了。请尝试cudaDeviceSynchronize()在您的应用程序中每次内核启动后添加一个调用。

于 2013-02-19T18:24:01.490 回答