c++ - CUDA 内核与 Julia 集的 CPU 版本相比性能缓慢

Question

我正在通过《CUDA by Example》一书学习 CUDA。第 4 章中有一个生成 Julia 分形的演示，书中给出了 CPU 和 GPU 两个版本。我决定加上计时，看看两种情况下的执行速度；令我惊讶的是，CPU 版本的执行速度比 GPU 版本快 3 倍。

CPU Julia 生成总时间：

745 毫秒。

GPU Julia 生成总时间：

2456 毫秒。

那么发生了什么？至少从 CUDA 内核代码来看，执行显然是并行的：工作分布在 1000×1000 个块中，每个块计算最终 1000x1000 分辨率图像中的一个像素。

下面是实现的源代码：

 // N is not referenced by any of the code shown here.
 #define N 10
 // Width and height, in pixels, of the rendered image; also used as the launch grid size.
 #define DIM 1000
 // Shorthand for one 8-bit colour channel of the output bitmap.
 typedef unsigned char byte;

/**
 * Minimal single-precision complex number usable from both host and device code.
 *
 * All read-only operations are const-qualified so they can be invoked on
 * const objects, const references and temporaries.
 */
struct cuComplex {
    float r;  // real part
    float i;  // imaginary part

    // Builds the complex number a + b*i.
    __host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}

    // Squared magnitude r*r + i*i (no square root is taken).
    __host__ __device__ float magnitude2(void) const {
        return r * r + i * i;
    }

    // Complex product (*this) * a.
    __host__ __device__ cuComplex operator*(const cuComplex& a) const {
        return cuComplex(r * a.r - i * a.i, i * a.r + r * a.i);
    }

    // Complex sum (*this) + a.
    __host__ __device__ cuComplex operator+(const cuComplex& a) const {
        return cuComplex(r + a.r, i + a.i);
    }
};

 // Device-side Julia-set membership test for pixel (x, y) of the DIM x DIM image.
 // Returns 0 as soon as the squared magnitude of the iterate exceeds 1000,
 // and 1 if that never happens within 200 iterations.
 __device__ int juliaGPU(int x , int y){
     const float scale = 1.3;
     const int half = DIM / 2;

     // Map the pixel to the complex plane: the image centre becomes the origin.
     cuComplex z(scale * (float)(half - x) / half,
                 scale * (float)(half - y) / half);
     cuComplex c(-0.8, 0.156);

     int step = 0;
     while (step < 200) {
         z = z * z + c;
         if (z.magnitude2() > 1000)
             return 0;
         ++step;
     }
     return 1;
 }

 // Colours one pixel of the DIM x DIM RGBA image per thread.
 //
 // Launch layout: any 2D grid/block combination whose total thread extent covers
 // DIM x DIM. The single-thread-per-block launch <<<dim3(DIM, DIM), 1>>> still
 // works and produces the same image as before; multi-thread blocks such as
 // <<<dim3((DIM + 15) / 16, (DIM + 15) / 16), dim3(16, 16)>>> are now valid too.
 // Rows are always DIM pixels wide, independent of the launch configuration.
 //
 // ptr must be dereferenceable from device code and hold at least
 // DIM * DIM * 4 bytes.
 __global__ void kernelGPU(byte *ptr){
     // Global pixel coordinates; with blockDim == 1 this reduces to blockIdx.
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
     const int y = blockIdx.y * blockDim.y + threadIdx.y;

     // Threads in the grid tail (when DIM is not a multiple of the block size)
     // must not write outside the image.
     if (x >= DIM || y >= DIM)
         return;

     const int offset = x + y * DIM;
     const int juliaValue = juliaGPU(x, y);

     // RGBA: red marks points inside the set, alpha is fully opaque.
     ptr[offset * 4 + 0] = 255 * juliaValue;
     ptr[offset * 4 + 1] = 0;
     ptr[offset * 4 + 2] = 0;
     ptr[offset * 4 + 3] = 255;
 }


 // Bundle holding the pointer to the bitmap buffer allocated with cudaMalloc.
 struct DataBlock {
     unsigned char* dev_bitmap;
 };
 // Renders the Julia set with kernelGPU, prints the elapsed GLUT time for the
 // kernel launch plus the device-to-host copy, then displays the bitmap.
 void juliaGPUTestSample(){
     CPUBitmap bitmap(DIM, DIM);

     byte *dev_bitmap = 0;  // image buffer allocated with cudaMalloc
     HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, bitmap.image_size()));

     dim3 grid(DIM, DIM);
     int startTime = glutGet(GLUT_ELAPSED_TIME);

     // One block per pixel, one thread per block.
     kernelGPU<<<grid, 1>>>(dev_bitmap);
     // A kernel launch returns no status itself; a bad launch configuration is
     // only reported through cudaGetLastError().
     HANDLE_ERROR(cudaGetLastError());
     HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap, bitmap.image_size(),
                             cudaMemcpyDeviceToHost));

     int elapsed = glutGet(GLUT_ELAPSED_TIME) - startTime;
     printf("Total time %d\n:", elapsed);

     HANDLE_ERROR(cudaFree(dev_bitmap));

     bitmap.display_and_exit();
 }

 // Program entry point: runs the GPU Julia-set sample.
 int main(void){
     juliaGPUTestSample();
     // Report success; a non-zero exit status conventionally signals failure.
     return 0;
 }

这是CPU版本：

/// “cuComplex” 结构与上面的相同。

int julia (int x , int y){

const float scale = 1.3;
float jx = scale * (float)(DIM/2 -x)/(DIM/2);
float jy = scale * (float)(DIM/2 -y)/(DIM/2);

cuComplex c(-0.8 ,0.156);
cuComplex a(jx ,jy);

int i = 0;
for(i=0; i <200;i++){

    a = a * a +c;
    if(a.magnitude2() >1000){

        return 0;
    }
}

return 1;

}

// CPU renderer: fills the DIM x DIM RGBA image at ptr, row by row.
// Pixels for which julia() returns 1 get (125, x, y) as colour, with x and y
// truncated to 8 bits by the store; all other pixels are black. Alpha is 255.
void kernel(unsigned char *ptr){
    for (int offset = 0; offset < DIM * DIM; ++offset) {
        // Recover the pixel coordinates from the row-major pixel index.
        const int x = offset % DIM;
        const int y = offset / DIM;
        const int juliaValue = julia(x, y);

        unsigned char *pixel = ptr + offset * 4;
        pixel[0] = juliaValue * 125;
        pixel[1] = juliaValue * x;
        pixel[2] = juliaValue * y;
        pixel[3] = 255;
    }
}
// Renders the Julia set on the CPU, prints the elapsed GLUT time for the
// rendering pass, then displays the bitmap.
void juliaCPUTestSample(){
    CPUBitmap bitmap(DIM, DIM);
    unsigned char *pixels = bitmap.get_ptr();

    const int begin = glutGet(GLUT_ELAPSED_TIME);
    kernel(pixels);
    const int elapsed = glutGet(GLUT_ELAPSED_TIME) - begin;

    printf("Total time %d\n:", elapsed);
    bitmap.display_and_exit();
}

更新系统配置：

Windows 7 64 位

CPU - Intel i7 -3770CPU 3.40GHz ,16GB RAM

GPU - 英伟达 Quadro 4000

score 9 · Accepted Answer

其他人已经注意到了这一点。

首先，在比较 CPU 和 GPU 的性能时，最好说明系统配置，包括硬件平台和软件。例如，我在一台配备 2.60GHz 四核 Core i7 CPU 和 Quadro 1000M GPU、运行 RHEL 6.2 和 CUDA 5.0 的 HP 笔记本电脑上运行了您的代码，得到的计时结果是 GPU 为 438，CPU 为 441。

其次，更重要的是，那本书中的 julia 示例是一个相对较早的 CUDA 编码示例，因此它并不是真正面向最大性能，而是为了说明到目前为止已经讨论过的概念。该书和其他各种 CUDA 教程材料首先介绍了在块级别使用 CUDA 进行并行编程。这一点的迹象在这里：

kernelGPU<<<grid ,1 >>>(dev_bitmap);

内核启动参数 <<<grid, 1>>> 表明将启动一个包含一定数量块（grid，在本例中总共 100 万个块）的网格，每个块只有一个线程。以 Fermi 级 GPU 为例，与启动每个线程块都拥有足够线程的网格相比，这立即使 GPU 的可用计算能力降到了原来的 1/32。Fermi 级 GPU 中的每个 SM 都有 32 个线程处理器，所有处理器同步执行。如果您启动一个只有 16 个线程的块，那么只有 16 个线程处理器在执行您的代码，而另外 16 个线程处理器什么都不做（即没有做任何有用的工作）。因此，仅包含 1 个线程的线程块只会使用 32 个线程处理器中的 1 个，其余 31 个处于空闲状态。

因此，这个特定的代码示例并不是为充分利用 GPU 的并行能力而设计的。考虑到它出现在书中讲解 CUDA 概念的较早阶段，这是可以理解的；我不认为作者的本意是让这段代码被拿来做基准测试，或者作为如何在 GPU 上编写高效代码的恰当范例。

考虑到这个 1/32 的因素，在您的系统上 CPU 只快了 3 倍，而在我的系统上 CPU 和 GPU 的吞吐量大致相当（这两块 GPU 很可能都不是特别高性能的 CUDA GPU），我认为这其实相当好地体现了 GPU 的实力：GPU 是在约 97% 的计算能力闲置的情况下参与这场比较的。

c++ - CUDA 内核与 Julia 集的 CPU 版本相比性能缓慢

1 回答 1

Related

Reference