我已经在两张不同的卡上测试了使用 CUDA 6.0 编译的代码:GT540M (Fermi) 和 Kepler K20c (Kepler),这些是结果
GT540M
Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120
开普勒 K20c
Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120
我没有观察到__threadfence()
针对其他两个考虑案例的任何特别缓慢的行为。
这可以通过使用反汇编代码来证明。
usethreadfence()
c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_TID.X; R0 = threadIdx.x
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; P0 = (R0 != 0)
/*0018*/ S2R R5, SR_CTAID.X; R5 = blockIdx.x
/*0020*/ IMAD R3, R5, 0x3e8, R0; R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
if (threadIdx.x == 0)
/*0028*/ @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2; R2 = scratch + threadIdx.x
/*0030*/ IADD R4, R0, 0x11; R4 = R0 + 17 = threadIdx.x + 17
/*0038*/ ISCADD R3, R3, c[0xe][0x4], 0x2; R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/ @!P0 ST [R2], R5; scratch[threadIdx.x] = blockIdx.x
/*0048*/ ST [R3], R4; junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/ MEMBAR.GL; __threadfence
/*0058*/ @P0 BRA.U 0x98; if (threadIdx.x != 0) branch to 0x98
if (threadIdx.x == 0)
/*0060*/ @!P0 MOV R2, c[0xe][0xc]; R2 = &count
/*0068*/ @!P0 MOV R3, c[0x0][0x14]; R3 = gridDim.x
/*0070*/ @!P0 ATOM.INC R2, [R2], R3; R2 = value = count + 1; *(&count) ++
/*0078*/ @!P0 IADD R3, R3, -0x1; R3 = R3 - 1 = gridDim.x - 1
/*0080*/ @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT; P1 = (R2 == R3) = 8 value == (gridDim.x - 1))
/*0088*/ @!P0 SEL R2, RZ, 0x1, !P1; if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/ @!P0 STS.U8 [RZ], R2; Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*00a8*/ LDS.U8 R0, [RZ]; R0 = R2 = isLastBlockDone
/*00b0*/ ISETP.NE.AND P0, PT, R0, RZ, P0; P0 = (R0 == 0)
/*00b8*/ @!P0 EXIT; if (isLastBlockDone != 0) exits
/*00c0*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT; IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/ MOV R0, RZ;
/*00d0*/ @!P0 BRA 0x1b8;
/*00d8*/ MOV R2, c[0x0][0x14];
/*00e0*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/ MOV R2, RZ;
/*00f0*/ @!P0 BRA 0x170;
/*00f8*/ MOV R3, c[0x0][0x14];
/*0100*/ IADD R7, R3, -0x3;
/*0108*/ NOP;
/*0110*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/ IADD R2, R2, 0x4;
/*0120*/ LD R4, [R3];
/*0128*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/ LD R5, [R3+0x4];
/*0138*/ LD R6, [R3+0x8];
/*0140*/ LD R3, [R3+0xc];
/*0148*/ IADD R0, R4, R0;
/*0150*/ IADD R0, R5, R0;
/*0158*/ IADD R0, R6, R0;
/*0160*/ IADD R0, R3, R0;
/*0168*/ @P0 BRA 0x110;
/*0170*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/ @!P0 BRA 0x1b8;
/*0180*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/ IADD R2, R2, 0x1;
/*0190*/ LD R3, [R3];
/*0198*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/ NOP;
/*01a8*/ IADD R0, R3, R0;
/*01b0*/ @P0 BRA 0x180;
/*01b8*/ MOV R2, c[0xe][0x8];
/*01c0*/ ST [R2], R0;
/*01c8*/ EXIT;
justthreadfence()
Function : _Z15justthreadfencev
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0010*/ ISETP.NE.AND P0, PT, R3, RZ, PT; /* 0x1a8e0000fc31dc23 */
/*0018*/ S2R R4, SR_CTAID.X; /* 0x2c00000094011c04 */
/*0020*/ IMAD R2, R4, 0x3e8, R3; /* 0x2006c00fa0409ca3 */
/*0028*/ @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2; /* 0x4000780000402043 */
/*0030*/ IADD R3, R3, 0x11; /* 0x4800c0004430dc03 */
/*0038*/ ISCADD R2, R2, c[0xe][0x4], 0x2; /* 0x4000780010209c43 */
/*0040*/ @!P0 ST [R0], R4; /* 0x9000000000012085 */
/*0048*/ ST [R2], R3; /* 0x900000000020dc85 */
/*0050*/ MEMBAR.GL; /* 0xe000000000001c25 */
/*0058*/ EXIT; /* 0x8000000000001de7 */
usetwokernels_1()
Function : _Z15usetwokernels_1v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ S2R R2, SR_CTAID.X; /* 0x2c00000094009c04 */
/*0020*/ IMAD R4, R2, 0x3e8, R0; /* 0x2000c00fa0211ca3 */
/*0028*/ @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020e043 */
/*0030*/ IADD R0, R0, 0x11; /* 0x4800c00044001c03 */
/*0038*/ ISCADD R4, R4, c[0xe][0x4], 0x2; /* 0x4000780010411c43 */
/*0040*/ @!P0 ST [R3], R2; /* 0x900000000030a085 */
/*0048*/ ST [R4], R0; /* 0x9000000000401c85 */
/*0050*/ EXIT; /* 0x8000000000001de7 */
.....................................
usetwokernels_1()
Function : _Z15usetwokernels_2v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ @P0 EXIT; /* 0x80000000000001e7 */
/*0020*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT; /* 0x1a8e400053f1dc23 */
/*0028*/ MOV R0, RZ; /* 0x28000000fc001de4 */
/*0030*/ @!P0 BRA 0x130; /* 0x40000003e00021e7 */
/*0038*/ MOV R2, c[0x0][0x14]; /* 0x2800400050009de4 */
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; /* 0x1a0ec0000c21dc23 */
/*0048*/ MOV R2, RZ; /* 0x28000000fc009de4 */
/*0050*/ @!P0 BRA 0xe0; /* 0x40000002200021e7 */
/*0058*/ MOV R3, c[0x0][0x14]; /* 0x280040005000dde4 */
/*0060*/ IADD R7, R3, -0x3; /* 0x4800fffff431dc03 */
/*0068*/ NOP; /* 0x4000000000001de4 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ NOP; /* 0x4000000000001de4 */
/*0080*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0088*/ LD R4, [R3]; /* 0x8000000000311c85 */
/*0090*/ IADD R2, R2, 0x4; /* 0x4800c00010209c03 */
/*0098*/ LD R5, [R3+0x4]; /* 0x8000000010315c85 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT; /* 0x188e00001c21dc03 */
/*00a8*/ LD R6, [R3+0x8]; /* 0x8000000020319c85 */
/*00b0*/ LD R3, [R3+0xc]; /* 0x800000003030dc85 */
/*00b8*/ IADD R0, R4, R0; /* 0x4800000000401c03 */
/*00c0*/ IADD R0, R5, R0; /* 0x4800000000501c03 */
/*00c8*/ IADD R0, R6, R0; /* 0x4800000000601c03 */
/*00d0*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*00d8*/ @P0 BRA 0x80; /* 0x4003fffe800001e7 */
/*00e0*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*00e8*/ @!P0 BRA 0x130; /* 0x40000001000021e7 */
/*00f0*/ NOP; /* 0x4000000000001de4 */
/*00f8*/ NOP; /* 0x4000000000001de4 */
/*0100*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0108*/ IADD R2, R2, 0x1; /* 0x4800c00004209c03 */
/*0110*/ LD R3, [R3]; /* 0x800000000030dc85 */
/*0118*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*0120*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*0128*/ @P0 BRA 0x100; /* 0x4003ffff400001e7 */
/*0130*/ MOV R2, c[0xe][0x8]; /* 0x2800780020009de4 */
/*0138*/ ST [R2], R0; /* 0x9000000000201c85 */
/*0140*/ EXIT; /* 0x8000000000001de7 */
.....................................
可以看出, 的指令justthreadfencev()
严格包含在的指令中usethreadfence()
,而usetwokernels_1()
和的指令实际上是 的指令的usetwokernels_2()
划分justthreadfencev()
。因此,时间上的差异可以归因于第二个内核的内核启动开销。