有没有办法使用 cuda-gdb 人为触发内核故障?
是的,这是可能的。这是一个完整的示例:
$ cat t678.cu
#include <stdio.h>
__global__ void kernel(int *data){ // Demo kernel: loads data[idx] and stores the loaded value into data[1].
int idx = 0; // line 4 -- the cuda-gdb session sets its breakpoint on this statement
idx += data[0]; // host code zero-fills the buffer, so idx stays 0 unless changed in the debugger
int tval = data[idx]; // this load faults when idx has been forced far outside the 32-int allocation
data[1] = tval; // write the loaded value back to element 1
}
// Host driver: allocates a zero-filled buffer of 32 ints on the device, launches
// `kernel` with a single thread, and reports any launch or execution failure.
// Returns 0 on success and 1 if any CUDA call or the kernel itself failed.
int main(){
  const size_t bytes = 32*sizeof(int);
  int *d_data = NULL;

  cudaError_t err = cudaMalloc(&d_data, bytes);
  if (err != cudaSuccess) {
    printf("cudaMalloc fail %s\n", cudaGetErrorString(err));
    return 1;
  }

  err = cudaMemset(d_data, 0, bytes);
  if (err != cudaSuccess) {
    printf("cudaMemset fail %s\n", cudaGetErrorString(err));
    cudaFree(d_data);
    return 1;
  }

  kernel<<<1,1>>>(d_data);
  // A launch does not return a status: configuration errors show up through
  // cudaGetLastError(), while faults raised during execution are reported by
  // the next synchronizing call.
  err = cudaGetLastError();
  if (err == cudaSuccess) err = cudaDeviceSynchronize();
  if (err != cudaSuccess) printf("kernel fail %s\n", cudaGetErrorString(err));

  // Best-effort cleanup: after a device fault this call may itself report the
  // same error, so its status is intentionally not checked.
  cudaFree(d_data);
  return (err == cudaSuccess) ? 0 : 1;
}
$ nvcc -g -G -o t678 t678.cu
$ cuda-gdb ./t678
NVIDIA (R) CUDA Debugger
7.5 release
Portions Copyright (C) 2007-2015 NVIDIA Corporation
GNU gdb (GDB) 7.6.2
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /home/user2/misc/t678...done.
(cuda-gdb) break t678.cu:4
Breakpoint 1 at 0x4026d5: file t678.cu, line 4.
(cuda-gdb) run
Starting program: /home/user2/misc/./t678
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7ffff700a700 (LWP 8693)]
[Switching focus to CUDA kernel 0, grid 2, block (0,0,0), thread (0,0,0), device 0, sm 14, warp 2, lane 0]
Breakpoint 1, kernel<<<(1,1,1),(1,1,1)>>> (data=0x13047a0000) at t678.cu:4
4 int idx = 0; // line 4
(cuda-gdb) step
5 idx += data[0];
(cuda-gdb) print idx
$1 = 0
(cuda-gdb) set idx=1000000
(cuda-gdb) step
6 int tval = data[idx];
(cuda-gdb) print idx
$2 = 1000000
(cuda-gdb) step
CUDA Exception: Device Illegal Address
The exception was triggered in device 0.
Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
kernel<<<(1,1,1),(1,1,1)>>> (data=0x13047a0000) at t678.cu:7
7 data[1] = tval;
(cuda-gdb)
在上面的 cuda-gdb 输出中可以看到:在调试器中将 idx 变量设置为一个很大的值之后,继续执行下面这一行就会触发越界访问(index-out-of-bounds,即 illegal address)错误:
int tval = data[idx];