0

最近我在学习 Jason Sanders 的 CUDA 一书中的例子。其中 Julia Set 的例子性能很差,运行耗时 7032 ms。程序如下:

#include <cuda.h>
#include <cuda_runtime.h>
#include <cpu_bitmap.h>
#include <book.h>
#define DIM 1024

// Minimal single-precision complex number used by the Julia set iteration.
//
// All members are callable from both host and device code, so the same type
// can be used in a kernel and in a CPU reference computation. Operations
// that do not modify the object are const-qualified.
struct cuComplex{
    float r;   // real part
    float i;   // imaginary part

    // Builds the complex number a + b*i.
    __host__ __device__ cuComplex(float a, float b) : r(a),i(b){
    }

    // Returns the squared magnitude r*r + i*i (no square root is taken).
    __host__ __device__ float magnitude2(void) const{
        return r*r+i*i;
    }

    // Complex product: (r + i*j) * (a.r + a.i*j).
    __host__ __device__ cuComplex operator *(const cuComplex& a) const{
        return cuComplex(r*a.r-i*a.i, i*a.r+r*a.i);
    }

    // Component-wise complex sum.
    __host__ __device__ cuComplex operator +(const cuComplex& a) const{
        return cuComplex(r+a.r,i+a.i);
    }
};

// Tests whether pixel (x, y) of the DIM x DIM image belongs to the Julia set
// for the constant c = -0.8 + 0.156i.
//
// The pixel is mapped to a point of the complex plane centred on the image
// and scaled by 1.5, then a = a*a + c is iterated up to 200 times.
//
// Returns 0 as soon as the squared magnitude of a exceeds 1000 (the point
// escapes), and 1 if it never does within the iteration limit.
__device__ int julia(int x,int y){
    // Float-suffixed literals keep every constant in single precision, so no
    // double-precision constant appears in this device function.
    const float scale = 1.5f;
    const int maxIterations = 200;
    const float escapeThreshold = 1000.0f;

    // Shift the origin to the image centre and normalise by half the image size.
    float jx = scale * (float)(DIM/2 - x)/(DIM/2);
    float jy = scale * (float)(DIM/2 - y)/(DIM/2);

    cuComplex c(-0.8f,0.156f);
    cuComplex a(jx,jy);

    for(int i = 0; i<maxIterations; i++){
        a = a*a + c;
        if(a.magnitude2() > escapeThreshold){
            return 0;
        }
    }
    return 1;
}

// Colours one pixel per block: the block's (x, y) index in the grid is used
// directly as the pixel coordinate, and the thread index is not consulted.
// ptr is written as 4 bytes per pixel, rows being gridDim.x pixels wide.
__global__ void kernel(unsigned char *ptr){
    const int col = blockIdx.x;
    const int row = blockIdx.y;

    // Linear pixel index, then the address of that pixel's first byte.
    const int pixelIndex = col + row*gridDim.x;
    unsigned char *rgba = ptr + pixelIndex*4;

    // julia() yields 1 or 0, so the first channel becomes 255 or 0.
    const int inSet = julia(col,row);
    rgba[0] = 255*inSet;
    rgba[1] = 0;
    rgba[2] = 1;
    rgba[3] = 255;
}




// Renders the Julia set into a DIM x DIM bitmap on the GPU, prints the time
// taken by the kernel plus the device-to-host copy, and displays the image.
int main(void){

    CPUBitmap bitmap(DIM,DIM);
    unsigned char * dev_bitmap;

    // One block per pixel; the launch below uses a single thread per block.
    dim3 grid(DIM,DIM);
    cudaEvent_t start,stop;
    cudaEvent_t bitmapCpy_start,bitmapCpy_stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventCreate(&bitmapCpy_start));
    HANDLE_ERROR(cudaEventCreate(&bitmapCpy_stop));

    HANDLE_ERROR(cudaMalloc((void **)&dev_bitmap,bitmap.image_size()));

    HANDLE_ERROR(cudaEventRecord(start,0));


    kernel<<<grid,1>>>(dev_bitmap);
    // A kernel launch returns no status of its own; query it explicitly so
    // that an invalid launch configuration is reported at this point.
    HANDLE_ERROR(cudaGetLastError());

    // Blocking copy: any error raised while the kernel was executing
    // surfaces through this call.
    HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(),dev_bitmap,bitmap.image_size(),cudaMemcpyDeviceToHost));
    //HANDLE_ERROR(cudaEventRecord(bitmapCpy_stop,0));
    //HANDLE_ERROR(cudaEventSynchronize(bitmapCpy_stop));
//  float copyTime;
//  HANDLE_ERROR(cudaEventElapsedTime(&copyTime,bitmapCpy_start,bitmapCpy_stop));

    // stop is recorded after the copy, so the measured interval covers both
    // the kernel and the device-to-host transfer.
    HANDLE_ERROR(cudaEventRecord(stop,0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,start,stop));



    //printf("Total time is %3.1f ms, time for copying is %3.1f ms \n",elapsedTime,copyTime);
    printf("Total time is %3.1f ms\n",elapsedTime);

    // NOTE(review): judging by its name, display_and_exit() presumably does
    // not return, in which case the cleanup below never runs -- confirm
    // against cpu_bitmap.h.
    bitmap.display_and_exit();
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaEventDestroy(bitmapCpy_start));
    HANDLE_ERROR(cudaEventDestroy(bitmapCpy_stop));
    HANDLE_ERROR(cudaFree(dev_bitmap));
}

我认为影响性能的主要因素是上面的程序在每个块中只运行 1 个线程:

 kernel<<<grid,1>>>(dev_bitmap);

所以我改变内核如下:

// Revised kernel from the question: every thread derives its own (x, y)
// coordinate from both its block index and its thread index, then writes
// four bytes for that coordinate into ptr.
__global__ void kernel(unsigned char *ptr){

    // Coordinate = block origin (blockIdx * blockDim) + position inside the block.
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;

    // NOTE(review): the factor applied to y here is gridDim.x*blockIdx.x,
    // which changes from block to block. The answer further down identifies
    // this line as the incorrect calculation and uses
    // gridDim.x*blockDim.x instead.
    int offset = x + y*gridDim.x*blockIdx.x;

    // The first byte is set to 255 times the value returned by julia();
    // the other three bytes are constants.
    int juliaValue = julia(x,y);
    ptr[offset*4 + 0] = 255*juliaValue;
    ptr[offset*4 + 1] = 0;
    ptr[offset*4 + 2] = 1;
    ptr[offset*4 + 3] = 255;

}

并调用内核:

dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>(dev_bitmap);

我觉得这个改动并不大,但运行之后,程序就像陷入了无限循环:没有图像出现,屏幕也无法进行任何操作,整个界面就卡在那里。

工具包:cuda 5.5

系统:ubuntu 12.04

4

1 回答 1

2

当我运行您在此处发布的原始代码时,我得到了正确的显示和大约 340 毫秒的时间。

当我更改内核时,我在内核启动时收到“未指定的启动错误”。

在您修改后的内核中,您有以下不正确的计算:

int offset = x + y*gridDim.x*blockIdx.x;

当我将其更改为:

int offset = x + y*gridDim.x*blockDim.x;

程序可以正常执行并得到正确结果,显示的耗时约为 10 毫秒。

于 2013-10-30T16:37:21.537 回答