您的示例代码已被编译器完全优化掉,因为没有任何代码有助于全局内存写入。这很容易通过将内核编译为 cubin 文件并使用以下命令反汇编结果来证明cuobjdump
:
$ nvcc -arch=sm_20 -Xptxas="-v" -cubin struct.cu
ptxas info : Compiling entry function '_Z8myKernelv' for 'sm_20'
ptxas info : Function properties for _Z8myKernelv
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 2 registers, 32 bytes cmem[0]
$ cuobjdump -sass struct_dumb.cubin
code for sm_20
Function : _Z8myKernelv
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00001de780000000*/ EXIT;
.............................
IE。内核完全是空的。调试器无法调试您要调查的代码,因为它在编译器/汇编器发出的内容中不存在。如果我们对您的代码有一些自由:
typedef struct __align__(8) {
int m_x0;
int m_tx;
} myStruct;
__device__ __noinline__ void foo(myStruct *st) {
st->m_tx = 99;
st->m_x0 = 123;
}
__global__ void myKernel(int dowrite, int *output){
myStruct m_struct ;
m_struct.m_tx = 45;
m_struct.m_x0 = 90;
if (dowrite) {
foo(&m_struct);
output[threadIdx.x] = m_struct.m_tx + m_struct.m_x0;
}
}
int main(void) {
int * output;
cudaMalloc((void **)(&output), sizeof(int));
myKernel <<<1,1 >>>(1, output);
cudaThreadSynchronize();
return 0;
}
并重复相同的编译和反汇编步骤,事情看起来有些不同:
$ nvcc -arch=sm_20 -Xptxas="-v" -cubin struct_dumb.cu
ptxas info : Compiling entry function '_Z8myKerneliPi' for 'sm_20'
ptxas info : Function properties for _Z8myKerneliPi
8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for _Z3fooP8myStruct
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 40 bytes cmem[0]
$ /usr/local/cuda/bin/cuobjdump -sass struct_dumb.cubin
code for sm_20
Function : _Z8myKerneliPi
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x20105d034800c000*/ IADD R1, R1, -0x8;
/*0010*/ /*0x68009de218000001*/ MOV32I R2, 0x5a;
/*0018*/ /*0xb400dde218000000*/ MOV32I R3, 0x2d;
/*0020*/ /*0x83f1dc23190e4000*/ ISETP.EQ.AND P0, pt, RZ, c [0x0] [0x20], pt;
/*0028*/ /*0x00101c034800c000*/ IADD R0, R1, 0x0;
/*0030*/ /*0x00109ca5c8000000*/ STL.64 [R1], R2;
/*0038*/ /*0x000001e780000000*/ @P0 EXIT;
/*0040*/ /*0x10011c0348004000*/ IADD R4, R0, c [0x0] [0x4];
/*0048*/ /*0xc001000750000000*/ CAL 0x80;
/*0050*/ /*0x00009ca5c0000000*/ LDL.64 R2, [R0];
/*0058*/ /*0x84011c042c000000*/ S2R R4, SR_Tid_X;
/*0060*/ /*0x90411c4340004000*/ ISCADD R4, R4, c [0x0] [0x24], 0x2;
/*0068*/ /*0x0c201c0348000000*/ IADD R0, R2, R3;
/*0070*/ /*0x00401c8590000000*/ ST [R4], R0;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x8c00dde218000001*/ MOV32I R3, 0x63;
/*0088*/ /*0xec009de218000001*/ MOV32I R2, 0x7b;
/*0090*/ /*0x1040dc8590000000*/ ST [R4+0x4], R3;
/*0098*/ /*0x00409c8590000000*/ ST [R4], R2;
/*00a0*/ /*0x00001de790000000*/ RET;
...............................
我们在汇编器输出中得到实际代码。你可能在调试器中有更多的运气。