image - cuda中的Sobel滤波器（无法显示完整图像）

Question

我有一个关于使用 CUDA 的 sobel 滤波器输出的经典问题。

这是一个主类（main.cpp）

/*main class */
int main(int argc, char** argv)
{
   IplImage* image_source = cvLoadImage("test.jpg",     
                     CV_LOAD_IMAGE_GRAYSCALE);
   IplImage* image_input = cvCreateImage(cvGetSize(image_source),
                      IPL_DEPTH_8U,image_source->nChannels);
   IplImage* image_output = cvCreateImage(cvGetSize(image_source),
                      IPL_DEPTH_8U,image_source->nChannels);

  /* Convert from IplImage tofloat */
  cvConvert(image_source,image_input);

  unsigned char *h_out = (unsigned char*)image_output->imageData;
  unsigned char *h_in =  (unsigned char*)image_input->imageData;

width     = image_input->width;
height    = image_input->height;
widthStep = image_input->widthStep;

sobel_parallel(h_in, h_out, width, height, widthStep);
cvShowImage( "CPU", image_output );
cvReleaseImage( &image_output );
waitKey(0);
}

这是 CUDA 文件 (kernel_gpu.cu)

__global__ void kernel ( unsigned char *d_in ,  unsigned char *d_out , int width ,
     int height, int widthStep ) {

int col = blockIdx . x * blockDim . x + threadIdx . x ;
int row = blockIdx . y * blockDim . y + threadIdx . y ;

int dx [3][3] = { -1 , 0 , 1 ,
                 -2 , 0 , 2 ,
                 -1 , 0 , 1};

int dy [3][3] = {1 ,2 ,1 ,
                 0 ,0 ,0 ,
                -1 , -2 , -1};

int s;
if( col < width && row < height)
    {
        int i = row;
        int j = col;
        // apply kernel in X direction
        int sum_x=0;
        for(int m=-1; m<=1; m++)
            for(int n=-1; n<=1; n++)
            {
            s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
            sum_x+=s*dx[m+1][n+1];
            }
        // apply kernel in Y direction
        int sum_y=0;
        for(int m=-1; m<=1; m++)
            for(int n=-1; n<=1; n++)
            {
            s=d_in[(i+m)*widthStep+j+n]; // get the (i,j) pixel value
            sum_y+=s*dy[m+1][n+1];
            }
        int sum=abs(sum_x)+abs(sum_y);
        if (sum>255)
            sum=255;
        d_out[i*widthStep+j]=sum; // set the (i,j) pixel value
    }

}
//   Kernel Calling Function

extern "C" void sobel_parallel( unsigned char* h_in,  unsigned char* h_out,
    int rows, int cols, int widthStep){

unsigned char* d_in;
unsigned char* d_out;
cudaMalloc((void**) &d_in, rows*cols);
cudaMalloc((void**) &d_out, rows*cols);

cudaMemcpy(d_in, h_in, rows*cols*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
dim3 grid ((rows * cols) / 256.0);
    kernel<<<grid,block>>>(d_in, d_out, rows, cols, widthStep);

cudaMemcpy(h_out, d_out, rows*cols*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
}

错误：结果图像未完整显示，仅显示图像的一部分。

为什么结果（GPU）是这样的？（我尝试使用相同的函数进行 CPU 计算，没有问题）。

score 3 · Accepted Answer

您正在创建 1 维网格，同时在内核内部使用 2D 索引，它将仅覆盖 x 方向，并且仅过滤图像的前 16 行（因为块的高度为 16）。

dim3 grid ((rows * cols) / 256.0); //This is incorrect in current case

考虑创建二维网格，使其跨越图像的所有行。

dim3 grid ((cols + 15)/16, (rows + 15)/16);

score 1 · Accepted Answer

检查 widthand widthStep变量以查看它们是否实际上相等，因为在您的sobel_parallel函数中您隐含地假设这一点（这可能不是真的，因为您的数据是对齐的）。如果这不是真的代码

 cudaMalloc((void**) &d_in, rows*cols);

实际上会分配比必要更少的内存，因此您只会处理部分图像。使用会更好

cudaMalloc((void**) &d_in, rows*widthStep);

当然，根据需要调整其余代码。

你也在打电话

void sobel_parallel( unsigned char* h_in,  unsigned char* h_out,
int rows, int cols, int widthStep)

和

sobel_parallel(h_in, h_out, width, height, widthStep);

它与 cols 交换行，并在您调用内核时再次交换。当您使用上述建议时，这将导致问题。

image - cuda中的Sobel滤波器（无法显示完整图像）

2 回答 2

Related

Reference