
I'm trying to create a negative image with CUDA, using the same functionality as the CPU computation.

Here is the main code.

int main(int argc, char** argv)
{

    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    IplImage* image_output = cvCreateImage(cvGetSize(image_input),
                    IPL_DEPTH_8U,image_input->nChannels);

    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    width     = image_input->width;
    height    = image_input->height;
    widthStep = image_input->widthStep;
    channels  = image_input->nChannels;

    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    cvShowImage("CPU", image_output);

    waitKey(0);
    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);

}

Here is the CUDA code.

__global__ void kernel ( unsigned char *d_in ,unsigned char* d_out, int width , int height, int widthStep, int channels) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;

int s;

if( x < width && y < height){
    int i = y;
    int j = x;
        for(int k=0;k<channels;k++){
            s = d_in[i*widthStep + j*channels + k];
            s = 255-d_in[i*widthStep + j*channels + k];
            d_out[i*widthStep + j*channels + k]=s;
        }

    }
}

extern "C" void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels){

unsigned char* d_in;
unsigned char* d_out;
cudaMalloc((void**) &d_in, width*height);
cudaMalloc((void**) &d_out, width*height);

cudaMemcpy(d_in, h_in, width*height*sizeof( unsigned char), cudaMemcpyHostToDevice);
dim3 block (16,16);
dim3 grid (width/16, height/16);
kernel<<<grid,block>>>(d_in, d_out, width, height, widthStep, channels);

cudaMemcpy(h_out, d_out, width*height*sizeof( unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);

}

When the computation is done on the CPU, the negative image comes out fine. But when I use CUDA, it doesn't work: I just get a blank white image. What is wrong with my code? T_T


1 Answer


You were very close. You just need to include the number of bytes per channel in your memory allocations and transfers. Here is a working version of your code. I also added some error checking; see this question for more information on error checking. Note that in this case you don't need two buffers on the GPU. You could use a single buffer and do the conversion in place.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace std;

void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels);

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char** argv)
{
    IplImage* image_input = cvLoadImage("test.jpg", CV_LOAD_IMAGE_UNCHANGED);
    IplImage* image_output = cvCreateImage(cvGetSize(image_input), IPL_DEPTH_8U,image_input->nChannels);

    unsigned char *h_out = (unsigned char*)image_output->imageData;
    unsigned char *h_in =  (unsigned char*)image_input->imageData;

    int width     = image_input->width;
    int height    = image_input->height;
    int widthStep = image_input->widthStep;
    int channels  = image_input->nChannels;

    negatif_parallel(h_in, h_out,  width, height, widthStep, channels);

    cvShowImage("Original", image_input);
    cvShowImage("CPU", image_output);

    waitKey(0);

    cvReleaseImage(&image_input);
    cvReleaseImage(&image_output);
}

__global__ void kernel (unsigned char *d_in,unsigned char* d_out, int width, int height, int widthStep, int channels) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int s;

    if (x < width && y < height) {
        int i = y;
        int j = x;
        for(int k=0; k< channels; k++) {
            s = d_in[i*widthStep + j*channels + k];
            s = 255-d_in[i*widthStep + j*channels + k];
            d_out[i*widthStep + j*channels + k]=s;
        }

    }
}

void negatif_parallel( unsigned char* h_in, unsigned char* h_out,  int width, int height, int widthStep,int channels)
{
    unsigned char* d_in;
    unsigned char* d_out;
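    // Allocate enough bytes to cover every channel (width*height*channels).
    // This assumes widthStep == width*channels, i.e. the IplImage rows have no padding.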
    cudaMalloc((void**) &d_in, width*height*channels);
    cudaMalloc((void**) &d_out, width*height*channels);

    gpuErrchk(cudaMemcpy(d_in, h_in, width*height*channels, cudaMemcpyHostToDevice));
    dim3 block (16,16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); // round up so edge pixels are covered when width or height is not a multiple of 16
    kernel<<<grid, block>>>(d_in, d_out, width, height, widthStep, channels);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() ); // Not strictly required because the next call, cudaMemcpy, is blocking

    gpuErrchk(cudaMemcpy(h_out, d_out, width * height * channels, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
}
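
A minimal sketch of the single-buffer, in-place variant mentioned above. The names kernel_inplace and negatif_parallel_inplace are just illustrative, gpuErrchk is the macro defined earlier, and the same no-row-padding assumption (widthStep == width*channels) applies:

__global__ void kernel_inplace(unsigned char *d_img, int width, int height, int widthStep, int channels)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        for (int k = 0; k < channels; k++) {
            int idx = y * widthStep + x * channels + k;
            d_img[idx] = 255 - d_img[idx];  // invert each channel in place
        }
    }
}

void negatif_parallel_inplace(unsigned char* h_in, unsigned char* h_out, int width, int height, int widthStep, int channels)
{
    unsigned char* d_img;
    gpuErrchk(cudaMalloc((void**) &d_img, width * height * channels));
    gpuErrchk(cudaMemcpy(d_img, h_in, width * height * channels, cudaMemcpyHostToDevice));

    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);  // round up to cover edge tiles
    kernel_inplace<<<grid, block>>>(d_img, width, height, widthStep, channels);
    gpuErrchk(cudaPeekAtLastError());

    // The blocking cudaMemcpy below also synchronizes with the kernel before copying the result back.
    gpuErrchk(cudaMemcpy(h_out, d_img, width * height * channels, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_img));
}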
answered 2013-11-12 at 17:16