最近我在学习 JASON SANDERS 的 CUDA 一书中的例子。Juila Set 的例子造成了 7032ms 的糟糕表现。这是程序:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cpu_bitmap.h>
#include <book.h>
#define DIM 1024
struct cuComplex{
float r;
float i;
__device__ cuComplex(float a, float b) : r(a),i(b){
}
__device__ float magnitude2(void){
return r*r+i*i;
}
__device__ cuComplex operator *(const cuComplex& a){
return cuComplex(r*a.r-i*a.i, i*a.r+r*a.i);
}
__device__ cuComplex operator +(const cuComplex& a){
return cuComplex(r+a.r,i+a.i);
}
};
__device__ int julia(int x,int y){
const float scale = 1.5;
float jx = scale * (float)(DIM/2 - x)/(DIM/2);
float jy = scale * (float)(DIM/2 - y)/(DIM/2);
cuComplex c(-0.8,0.156);
cuComplex a(jx,jy);
int i = 0;
for(i = 0; i<200; i++){
a = a*a + c;
if(a.magnitude2() > 1000){
return 0;
}
}
return 1;
}
__global__ void kernel(unsigned char *ptr){
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y*gridDim.x;
int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;
}
int main(void){
CPUBitmap bitmap(DIM,DIM);
unsigned char * dev_bitmap;
dim3 grid(DIM,DIM);
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
dim3 thread(DIM,DIM);
cudaEvent_t start,stop;
cudaEvent_t bitmapCpy_start,bitmapCpy_stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_start));
HANDLE_ERROR(cudaEventCreate(&bitmapCpy_stop));
HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap,bitmap.image_size()));
HANDLE_ERROR(cudaEventRecord(start,0));
kernel<<<grid,1>>>(dev_bitmap);
HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));
//HANDLE_ERROR(cudaEventRecord(bitmapCpy_stop,0));
//HANDLE_ERROR(cudaEventSynchronize(bitmapCpy_stop));
// float copyTime;
// HANDLE_ERROR(cudaEventElapsedTime(©Time,bitmapCpy_start,bitmapCpy_stop));
HANDLE_ERROR(cudaEventRecord(stop,0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));
//printf("Total time is %3.1f ms, time for copying is %3.1f ms \n",elapsedTime,copyTime);
printf("Total time is %3.1f ms\n",elapsedTime);
bitmap.display_and_exit();
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_start));
HANDLE_ERROR(cudaEventDestroy(bitmapCpy_stop));
HANDLE_ERROR(cudaFree(dev_bitmap));
}
我认为影响性能的主要因素是上面的程序在每个块中只运行 1 个线程:
kernel<<<grid,1>>>(dev_bitmap);
所以我改变内核如下:
__global__ void kernel(unsigned char *ptr){
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int offset = x + y*gridDim.x*blockIdx.x;
int juliaValue = julia(x,y);
ptr[offset*4 + 0] = 255*juliaValue;
ptr[offset*4 + 1] = 0;
ptr[offset*4 + 2] = 1;
ptr[offset*4 + 3] = 255;
}
并调用内核:
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>(dev_bitmap);
我觉得这个变化没什么大不了的,但是当我运行它时,它就像是陷入了无限循环,没有图像出现,我的屏幕也无法做任何事情,只是卡在那里。
工具包:cuda 5.5
系统:ubuntu 12.04