出于某种原因,我在特定内核中设置的断点被完全忽略了......我已经检查了错误状态cudaGetLastError()
,它告诉我一切运行正常,所以我很确定这应该意味着内核已经执行。放置printf
语句也不会产生额外的信息,因为没有打印任何内容。即使在进入调试模式的内核中,调用printf
也没有效果。这里有什么问题?!
我们在 Tesla M2075(驱动程序版本 295.41)上运行 Cuda 4.2。调试时输出:
(cuda-gdb) break cudaCalcBeamIntersect
Breakpoint 1 at 0x401cfb: file cudacalcbeamintersect.cu, line 109.
(cuda-gdb) r
Starting program: /home/heit/cuda/vfind/vfind singleevent.txt 1 1 1
[Thread debugging using libthread_db enabled]
[New Thread 0x7ffff5dd5700 (LWP 20241)]
[Context Create of context 0x634220 on Device 0]
[Launch of CUDA Kernel 0 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 2 (memset32_post<<<(64,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 3 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 4 (memset32_post<<<(1,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 5 (memset32_post<<<(8,1,1),(64,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
[Launch of CUDA Kernel 7 (cudaCalcBeamIntersect<<<(256,1,1),(128,1,1)>>>) on Device 0]
no error
Elapsed time: 0.876842 seconds.
[Thread 0x7ffff5dd5700 (LWP 20241) exited]
[Termination of CUDA Kernel 6 (cudaInitializeGlobals<<<(256,1,1),(128,1,1)>>>) on Device 0]
Program exited normally.
“无错误”打印通过调用在内核外部打印cout << cudaGetErrorString(cudaGetLastError()) << '\n';
,并指示两者cudaInitializeGlobals()
(可以在 cuda-gdb 中单步执行)并且cudaCalcBeamIntersect()
没有问题地执行。然而,后者不能被调试。
有问题的内核仍然是一个初步的内核,并计算一些值以存储在(静态)全局内存中。对这些值没有做任何其他事情,所以编译器会完全优化这个调用吗?如果是,为什么??!!以及如何防止这种行为?(-O0 无效)
干杯!
编辑 - 代码:
** 调用内核的代码 **
uint const nEvents = events.size(); // total number of events
/* Not important ... */
// Allocate memory to hold the events
Track *dev_events;
cudaMalloc(&dev_events, linearEvents.size() * sizeof(Track));
// Copy all events to the GPU
cudaMemcpy(dev_events, &linearEvents[0], linearEvents.size() * sizeof(Track), cudaMemcpyHostToDevice);
// Initialize the global data, like the histogram and the array of z-values
cudaInitializeGlobals <<< tpb, bpg >>> ();
cout << cudaGetErrorString(cudaGetLastError()) << '\n';
cout << "Processing " << nEvents << " event(s)\n";
uint linearIdx = 0;
for (uint event = 0; event != nEvents; ++event)
{
uint nTracks = events[event].size();
if (nTracks > MAX_NUMBER_OF_TRACKS)
{
cout << "Number of tracks in event " << event << " exceeds maximum number of tracks.\n";
exit(1);
}
cudaCalcBeamIntersect <<< tpb, bpg >>> (dev_events + linearIdx, nTracks, bipThresh, binWidth);
cout << cudaGetErrorString(cudaGetLastError()) << '\n';
// Update linear index
linearIdx += nTracks;
}
cudacalcbeamintersect.cu
#include "vfind.cuh"
__device__ float dev_zMin;
__device__ float dev_zMax;
__device__ float dev_zValues[MAX_NUMBER_OF_TRACKS];
__device__ uint dev_histogram[MAX_NUMBER_OF_BINS];
__constant__ Track dev_beam =
{
{0, 0, 1},
{0, 0, 0}
};
__global__ void cudaInitializeGlobals()
{
uint const tid = threadIdx.x + blockIdx.x * blockDim.x;
uint const nThreads = blockDim.x * gridDim.x;
if (tid == 0)
{
dev_zMin = 1e6;
dev_zMax = -1e6;
}
uint idx = tid;
while (idx < MAX_NUMBER_OF_BINS || idx < MAX_NUMBER_OF_TRACKS)
{
if (idx < MAX_NUMBER_OF_BINS)
dev_histogram[idx] = 0;
if (idx < MAX_NUMBER_OF_TRACKS)
dev_zValues[idx] = 0;
idx += nThreads;
}
}
__device__ float dot(float const v1[3], float const v2[3])
{
// Stuff
}
__device__ float distance(Track const &t1, Track const &t2)
{
// Even more boring unimportant stuff
}
__device__ Vertex vertex(Track const &t1, Track const &t2)
{
// Yet even more boring unimportant stuff
}
__global__ void cudaCalcBeamIntersect(Track const *tracks, uint nTracks, float bipTresh, float binWidth)
{
uint const tid = threadIdx.x + blockIdx.x * blockDim.x;
uint const nThreads = blockDim.x * gridDim.x;
uint idx = tid;
while (idx < nTracks)
{
float dist = distance(tracks[idx], dev_beam);
if (dist < bipTresh)
{
float z = vertex(tracks[idx], dev_beam).z;
if (z < dev_zMin)
atomicExch(&dev_zMin, z);
if (z > dev_zMax)
atomicExch(&dev_zMax, z);
dev_zValues[idx] = z;
}
idx += nThreads;
}
__syncthreads();
// To be continued here
}