cuda - CUDA __threadfence() 同步与两个单独内核调用同步的性能

Question

我试图了解如何使用__threadfence()，因为它似乎是一个强大的同步原语，它可以让不同的块一起工作，而无需经历结束内核和启动新内核的巨大麻烦。CUDA C 编程指南有一个例子（附录 B.5），它在 SDK 的“threadFenceReduction”示例中得到了充实，所以它看起来像是我们“应该”使用的东西。

但是，当我尝试使用时__threadfence()，它的速度非常慢。有关示例，请参见下面的代码。据我了解，__threadfence()在继续之前，应该确保当前线程块的所有待处理内存传输都已完成。我相信，内存延迟比微秒要好一些，因此在 GTX680 上处理包含代码中 64KB 内存传输的总时间应该在微秒左右。相反，该__threadfence()指令似乎需要20几微秒！我可以不使用__threadfence()同步，而是结束内核，并启动一个全新的内核（在相同的默认流中，以便同步），只需不到三分之一的时间！

这里发生了什么？我的代码中是否有我没有注意到的错误？还是__threadfence()真的20x比它应该6x的慢，比整个内核启动+清理慢？

threadfence 内核运行 1000 次的时间：27.716831 ms
答案：120
仅前 3 行（包括 threadfence）的 1000 次运行时间：25.962912 ms
不使用 threadfence 进行同步，通过拆分为两个内核：7.653344 ms
答案：120

#include "cuda.h"
#include <cstdio>

__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
__device__ int scratch[16];
__device__ int junk[16000];
__device__ int answer;

__global__ void usethreadfence() //just like the code example in B.5 of the CUDA C Programming Guide
{
    if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
    junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x; //do some more memory writes to make the kernel nontrivial
    __threadfence();

    if (threadIdx.x==0) {
        unsigned int value = atomicInc(&count, gridDim.x);
        isLastBlockDone = (value == (gridDim.x - 1));
    }
    __syncthreads();
    if (isLastBlockDone && threadIdx.x==0) {
    // The last block sums the results stored in scratch[0 .. gridDim.x-1]
        int sum=0;
        for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
        answer=sum;
    }
}

__global__ void justthreadfence() //first three lines of the previous kernel, so we can compare speeds
{
    if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
    junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
    __threadfence();
}

__global__ void usetwokernels_1() //this and the next kernel reproduce the functionality of the first kernel, but faster!
{
    if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
    junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
}

__global__ void usetwokernels_2()
{
    if (threadIdx.x==0) {
        int sum=0;
        for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
        answer=sum;
    }
}

int main() {
    int sum;

    cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
    for (int i=0;i<1000;i++) usethreadfence<<<16,1000>>>();
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of the threadfence kernel: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);

    cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
    for (int i=0;i<1000;i++) justthreadfence<<<16,1000>>>();
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of just the first 3 lines, including threadfence: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);

    cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
    for (int i=0;i<1000;i++) {usetwokernels_1<<<16,1000>>>(); usetwokernels_2<<<16,1000>>>();}
    cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Synchronizing without threadfence, by splitting to two kernels: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);
}

score 1 · Accepted Answer

我已经在两张不同的卡上测试了使用 CUDA 6.0 编译的代码：GT540M (Fermi) 和 Kepler K20c (Kepler)，这些是结果

GT540M

Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120

开普勒 K20c

Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120

我没有观察到__threadfence()针对其他两个考虑案例的任何特别缓慢的行为。

这可以通过使用反汇编代码来证明。

usethreadfence()

c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/         MOV R1, c[0x1][0x100];                                     
/*0008*/         S2R R0, SR_TID.X;                                          R0 = threadIdx.x
/*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                           P0 = (R0 != 0)
/*0018*/         S2R R5, SR_CTAID.X;                                        R5 = blockIdx.x
/*0020*/         IMAD R3, R5, 0x3e8, R0;                                    R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                                        if (threadIdx.x == 0)
/*0028*/    @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;                               R2 = scratch + threadIdx.x                           
/*0030*/         IADD R4, R0, 0x11;                                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/         ISCADD R3, R3, c[0xe][0x4], 0x2;                               R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/    @!P0 ST [R2], R5;                                                   scratch[threadIdx.x] = blockIdx.x
/*0048*/         ST [R3], R4;                                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/         MEMBAR.GL;                                                     __threadfence
/*0058*/     @P0 BRA.U 0x98;                                                if (threadIdx.x != 0) branch to 0x98
                                                                        if (threadIdx.x == 0)
/*0060*/    @!P0 MOV R2, c[0xe][0xc];                                           R2 = &count
/*0068*/    @!P0 MOV R3, c[0x0][0x14];                                          R3 = gridDim.x
/*0070*/    @!P0 ATOM.INC R2, [R2], R3;                                         R2 = value = count + 1; *(&count) ++ 
/*0078*/    @!P0 IADD R3, R3, -0x1;                                             R3 = R3 - 1 = gridDim.x - 1
/*0080*/    @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;                               P1 = (R2 == R3) = 8 value == (gridDim.x - 1))
/*0088*/    @!P0 SEL R2, RZ, 0x1, !P1;                                          if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/    @!P0 STS.U8 [RZ], R2;                                               Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/         ISETP.EQ.AND P0, PT, R0, RZ, PT;                           P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/         BAR.RED.POPC RZ, RZ, RZ, PT;                               __syncthreads()
/*00a8*/         LDS.U8 R0, [RZ];                                           R0 = R2 = isLastBlockDone
/*00b0*/         ISETP.NE.AND P0, PT, R0, RZ, P0;                           P0 = (R0 == 0)
/*00b8*/    @!P0 EXIT;                                                      if (isLastBlockDone != 0) exits
/*00c0*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;                 IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/         MOV R0, RZ;
/*00d0*/    @!P0 BRA 0x1b8;
/*00d8*/         MOV R2, c[0x0][0x14];
/*00e0*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/         MOV R2, RZ;
/*00f0*/    @!P0 BRA 0x170;
/*00f8*/         MOV R3, c[0x0][0x14];
/*0100*/         IADD R7, R3, -0x3;
/*0108*/         NOP;
/*0110*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/         IADD R2, R2, 0x4;
/*0120*/         LD R4, [R3];
/*0128*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/         LD R5, [R3+0x4];
/*0138*/         LD R6, [R3+0x8];
/*0140*/         LD R3, [R3+0xc];
/*0148*/         IADD R0, R4, R0;
/*0150*/         IADD R0, R5, R0;
/*0158*/         IADD R0, R6, R0;
/*0160*/         IADD R0, R3, R0;
/*0168*/     @P0 BRA 0x110;
/*0170*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/    @!P0 BRA 0x1b8;
/*0180*/         ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/         IADD R2, R2, 0x1;
/*0190*/         LD R3, [R3];
/*0198*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/         NOP;
/*01a8*/         IADD R0, R3, R0;
/*01b0*/     @P0 BRA 0x180;
/*01b8*/         MOV R2, c[0xe][0x8];
/*01c0*/         ST [R2], R0;
/*01c8*/         EXIT;

justthreadfence()

    Function : _Z15justthreadfencev
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R3, SR_TID.X;                      /* 0x2c0000008400dc04 */
    /*0010*/         ISETP.NE.AND P0, PT, R3, RZ, PT;       /* 0x1a8e0000fc31dc23 */
    /*0018*/         S2R R4, SR_CTAID.X;                    /* 0x2c00000094011c04 */
    /*0020*/         IMAD R2, R4, 0x3e8, R3;                /* 0x2006c00fa0409ca3 */
    /*0028*/    @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;       /* 0x4000780000402043 */
    /*0030*/         IADD R3, R3, 0x11;                     /* 0x4800c0004430dc03 */
    /*0038*/         ISCADD R2, R2, c[0xe][0x4], 0x2;       /* 0x4000780010209c43 */
    /*0040*/    @!P0 ST [R0], R4;                           /* 0x9000000000012085 */
    /*0048*/         ST [R2], R3;                           /* 0x900000000020dc85 */
    /*0050*/         MEMBAR.GL;                             /* 0xe000000000001c25 */
    /*0058*/         EXIT;                                  /* 0x8000000000001de7 */

usetwokernels_1()

    Function : _Z15usetwokernels_1v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                 /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                      /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;       /* 0x1a8e0000fc01dc23 */
    /*0018*/         S2R R2, SR_CTAID.X;                    /* 0x2c00000094009c04 */
    /*0020*/         IMAD R4, R2, 0x3e8, R0;                /* 0x2000c00fa0211ca3 */
    /*0028*/    @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;       /* 0x400078000020e043 */
    /*0030*/         IADD R0, R0, 0x11;                     /* 0x4800c00044001c03 */
    /*0038*/         ISCADD R4, R4, c[0xe][0x4], 0x2;       /* 0x4000780010411c43 */
    /*0040*/    @!P0 ST [R3], R2;                           /* 0x900000000030a085 */
    /*0048*/         ST [R4], R0;                           /* 0x9000000000401c85 */
    /*0050*/         EXIT;                                  /* 0x8000000000001de7 */
    .....................................

usetwokernels_1()

    Function : _Z15usetwokernels_2v
.headerflags    @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R0, SR_TID.X;                               /* 0x2c00000084001c04 */
    /*0010*/         ISETP.NE.AND P0, PT, R0, RZ, PT;                /* 0x1a8e0000fc01dc23 */
    /*0018*/     @P0 EXIT;                                           /* 0x80000000000001e7 */
    /*0020*/         ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      /* 0x1a8e400053f1dc23 */
    /*0028*/         MOV R0, RZ;                                     /* 0x28000000fc001de4 */
    /*0030*/    @!P0 BRA 0x130;                                      /* 0x40000003e00021e7 */
    /*0038*/         MOV R2, c[0x0][0x14];                           /* 0x2800400050009de4 */
    /*0040*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;               /* 0x1a0ec0000c21dc23 */
    /*0048*/         MOV R2, RZ;                                     /* 0x28000000fc009de4 */
    /*0050*/    @!P0 BRA 0xe0;                                       /* 0x40000002200021e7 */
    /*0058*/         MOV R3, c[0x0][0x14];                           /* 0x280040005000dde4 */
    /*0060*/         IADD R7, R3, -0x3;                              /* 0x4800fffff431dc03 */
    /*0068*/         NOP;                                            /* 0x4000000000001de4 */
    /*0070*/         NOP;                                            /* 0x4000000000001de4 */
    /*0078*/         NOP;                                            /* 0x4000000000001de4 */
    /*0080*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0088*/         LD R4, [R3];                                    /* 0x8000000000311c85 */
    /*0090*/         IADD R2, R2, 0x4;                               /* 0x4800c00010209c03 */
    /*0098*/         LD R5, [R3+0x4];                                /* 0x8000000010315c85 */
    /*00a0*/         ISETP.LT.U32.AND P0, PT, R2, R7, PT;            /* 0x188e00001c21dc03 */
    /*00a8*/         LD R6, [R3+0x8];                                /* 0x8000000020319c85 */
    /*00b0*/         LD R3, [R3+0xc];                                /* 0x800000003030dc85 */
    /*00b8*/         IADD R0, R4, R0;                                /* 0x4800000000401c03 */
    /*00c0*/         IADD R0, R5, R0;                                /* 0x4800000000501c03 */
    /*00c8*/         IADD R0, R6, R0;                                /* 0x4800000000601c03 */
    /*00d0*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*00d8*/     @P0 BRA 0x80;                                       /* 0x4003fffe800001e7 */
    /*00e0*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*00e8*/    @!P0 BRA 0x130;                                      /* 0x40000001000021e7 */
    /*00f0*/         NOP;                                            /* 0x4000000000001de4 */
    /*00f8*/         NOP;                                            /* 0x4000000000001de4 */
    /*0100*/         ISCADD R3, R2, c[0xe][0x0], 0x2;                /* 0x400078000020dc43 */
    /*0108*/         IADD R2, R2, 0x1;                               /* 0x4800c00004209c03 */
    /*0110*/         LD R3, [R3];                                    /* 0x800000000030dc85 */
    /*0118*/         ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;  /* 0x188e40005021dc03 */
    /*0120*/         IADD R0, R3, R0;                                /* 0x4800000000301c03 */
    /*0128*/     @P0 BRA 0x100;                                      /* 0x4003ffff400001e7 */
    /*0130*/         MOV R2, c[0xe][0x8];                            /* 0x2800780020009de4 */
    /*0138*/         ST [R2], R0;                                    /* 0x9000000000201c85 */
    /*0140*/         EXIT;                                           /* 0x8000000000001de7 */
    .....................................

可以看出，的指令justthreadfencev()严格包含在的指令中usethreadfence()，而usetwokernels_1()和的指令实际上是的指令的usetwokernels_2()划分justthreadfencev()。因此，时间上的差异可以归因于第二个内核的内核启动开销。

cuda - CUDA __threadfence() 同步与两个单独内核调用同步的性能

1 回答 1

Related

Reference