2

我正在 GTX 680 上做一些 CUDA 性能测试,想知道是否有人能帮我理解为什么会得到下面的性能结果。我运行的代码如下:

#include <stdio.h>
using namespace std;


/*
 * Benchmark kernel: every thread runs a loop with a compile-time trip count
 * of 10000, executing a block-wide barrier and one integer add per iteration.
 *
 * rec    - value stored to output[0] before the loop (and rec / 2 to
 *          output[1]); it does not influence the loop.
 * output - device pointer to at least two ints. It is dereferenced only by a
 *          thread whose threadIdx.x == 1000, so nothing is written when
 *          blockDim.x <= 1000. When such a thread exists, output[0] ends up
 *          holding the accumulated sum and output[1] holds rec / 2.
 */
__global__ void test_hardcoded(int rec,int * output)
{
    // Start from zero: accumulating into an uninitialized local is undefined
    // behaviour and would make the value stored below meaningless.
    int a = 0;
    int rec2=rec/2;
    if(threadIdx.x==1000) *output=rec;
    if(threadIdx.x==1000) *(output+1)=rec2;

    // The barrier is reached by every thread of the block on every iteration
    // (the loop bound is uniform), so it is not inside divergent control flow.
    for (int i=0;i<10000;i++)
    {
        __syncthreads();
        a+=i;
    }

    // Makes the accumulator observable; intended to keep the compiler from
    // discarding the loop above.
    if(threadIdx.x==1000) *output=a;

}
/*
 * Benchmark kernel: every thread runs a loop whose trip count is derived from
 * a kernel argument at run time, executing a block-wide barrier and one
 * integer add per iteration.
 *
 * rec    - the loop runs for i = 0 .. rec / 2 inclusive, i.e. rec / 2 + 1
 *          iterations when rec >= 0.
 * output - device pointer to at least two ints. It is dereferenced only by a
 *          thread whose threadIdx.x == 1000, so nothing is written when
 *          blockDim.x <= 1000. When such a thread exists, output[0] ends up
 *          holding the accumulated sum and output[1] holds rec / 2.
 */
__global__ void test_softcoded(int rec,int * output)
{
    // Start from zero: accumulating into an uninitialized local is undefined
    // behaviour and would make the value stored below meaningless.
    int a = 0;
    // Computed from the argument so the loop bound is not a compile-time
    // constant (the original author's intent: compare against a register
    // rather than constant memory).
    int rec2=rec/2;
    if(threadIdx.x==1000) *output=rec;
    if(threadIdx.x==1000) *(output+1)=rec2;

    // NOTE: the inclusive bound means rec == 20000 gives 10001 iterations.
    // The bound is uniform across the block, so every thread reaches the
    // barrier the same number of times.
    for (int i=0;i<=rec2;i++)
    {
        __syncthreads();
        a+=i;
    }

    // Makes the accumulator observable; intended to keep the compiler from
    // discarding the loop above.
    if(threadIdx.x==1000) *output=a;

}

// Evaluates a CUDA runtime call; on failure prints the error with its source
// location and makes the enclosing function return 1.
#define BENCH_CUDA_CHECK(call)                                              \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Times test_hardcoded and test_softcoded with CUDA events on device 0 and
 * prints the elapsed time of each in milliseconds.
 *
 * Both kernels are launched with 1000 blocks of 32x32 threads. Because
 * blockDim.x is 32, no thread has threadIdx.x == 1000, so the kernels never
 * dereference the NULL output pointer passed here.
 *
 * Returns 0 on success, 1 if any CUDA call or kernel launch fails.
 */
int main(int argc, char *argv[])
{
    float timestamp;
    cudaEvent_t event_start,event_stop;

    // Initialise
    BENCH_CUDA_CHECK(cudaSetDevice(0));
    BENCH_CUDA_CHECK(cudaEventCreate(&event_start));
    BENCH_CUDA_CHECK(cudaEventCreate(&event_stop));

    const dim3 threadsPerBlock(32, 32, 1);
    const dim3 blocks(1, 1000, 1);

    // Fixed trip count: 10000 iterations.
    BENCH_CUDA_CHECK(cudaEventRecord(event_start, 0));
    test_hardcoded<<<blocks,threadsPerBlock,0>>>(10000,NULL);
    BENCH_CUDA_CHECK(cudaGetLastError());   // surfaces launch-configuration errors
    BENCH_CUDA_CHECK(cudaEventRecord(event_stop, 0));
    BENCH_CUDA_CHECK(cudaEventSynchronize(event_stop));
    BENCH_CUDA_CHECK(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test_hardcoded() took  %fms \n", timestamp);

    // Run-time trip count: rec = 20000 gives a loop bound of rec / 2 = 10000.
    BENCH_CUDA_CHECK(cudaEventRecord(event_start, 0));
    test_softcoded<<<blocks,threadsPerBlock,0>>>(20000,NULL);
    BENCH_CUDA_CHECK(cudaGetLastError());   // surfaces launch-configuration errors
    BENCH_CUDA_CHECK(cudaEventRecord(event_stop, 0));
    BENCH_CUDA_CHECK(cudaEventSynchronize(event_stop));
    BENCH_CUDA_CHECK(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test_softcoded() took  %fms \n", timestamp);

    BENCH_CUDA_CHECK(cudaEventDestroy(event_start));
    BENCH_CUDA_CHECK(cudaEventDestroy(event_stop));
    return 0;
}

如代码所示,我运行了两个内核。它们所做的只是循环和累加。唯一的区别是 test_softcoded() 的循环条件与寄存器中的值进行比较,而 test_hardcoded() 的循环条件直接与硬编码的整数进行比较。

当我运行上面的代码时,我得到以下结果

$ nvcc -arch=sm_30 test7.cu
$ ./a.out

test_hardcoded() took  51.353985ms 
test_softcoded() took  99.209694ms 

test_hardcoded() 函数比 test_softcoded() 快了将近一倍!!!!

我知道 test_softcoded() 中可能存在寄存器的写后读(read-after-write)依赖,但据我理解,在高占用率下寄存器延迟会被完全隐藏(而这里的占用率应该非常高),所以我想知道问题可能出在哪里,以及怎样才能提高 test_softcoded() 的性能。

4

1 回答 1

1

Because the loop bound is hard-coded, the compiler can apply optimizations such as loop unrolling, which may improve performance to some extent. That may be the reason.

You can check this by adding unrolling to the for loop in "test_softcoded": put '#pragma unroll 5000' before 'for (int i=0;i<=rec2;i++)' and run it again. That should resolve your doubt.

于 2012-10-22T16:25:58.880 回答