-1

我尝试将 CUDA 与 Qt 一起使用来模糊图像。我使用 NPP 库,nppiFilterGauss_8u_C1R 效果很好

/**
 * Blurs a single-channel 8-bit image with NPP's fixed 15x15 Gaussian filter.
 *
 * The source image is uploaded to a temporary device buffer, filtered with
 * nppiFilterGauss_8u_C1R and the result is downloaded into pResultData.
 *
 * @param pSourceData   Host buffer holding the input image
 *                      (ImageLineStep * ImageHeight bytes are read).
 * @param pResultData   Host buffer receiving the filtered image
 *                      (ImageLineStep * ImageHeight bytes are written).
 * @param ImageLineStep Line step in bytes, used for both source and destination.
 * @param ImageWidth    ROI width in pixels.
 * @param ImageHeight   ROI height in pixels.
 *
 * If any allocation, copy or the filter itself fails, pResultData is left
 * untouched. Device buffers are always released before returning.
 */
void cuda_npp_gauss_filter_qt(uchar* pSourceData, uchar* pResultData, const int &ImageLineStep, const int &ImageWidth, const int &ImageHeight)
{
    NppiSize oSizeROI;
    oSizeROI.width = ImageWidth;
    oSizeROI.height = ImageHeight;

    const Npp32s SourceStep = ImageLineStep;
    const Npp32s DestinationStep = ImageLineStep;

    // Widen before multiplying so large images cannot overflow int arithmetic.
    const size_t AllocationSizeInBytes =
        static_cast<size_t>(ImageLineStep) * static_cast<size_t>(ImageHeight);

    // Initialised to NULL so the cudaFree calls below are safe no-ops when an
    // allocation never happened.
    Npp8u *pSource = NULL;
    Npp8u *pDestination = NULL;

    // Each step runs only if the previous one succeeded (short-circuit &&).
    // NPP reports errors as negative status codes; positive ones are warnings.
    const bool filtered =
        cudaMalloc<Npp8u>(&pSource, AllocationSizeInBytes) == cudaSuccess &&
        cudaMalloc<Npp8u>(&pDestination, AllocationSizeInBytes) == cudaSuccess &&
        cudaMemcpy(pSource, pSourceData, AllocationSizeInBytes, cudaMemcpyHostToDevice) == cudaSuccess &&
        nppiFilterGauss_8u_C1R(pSource, SourceStep, pDestination, DestinationStep, oSizeROI, NPP_MASK_SIZE_15_X_15) >= NPP_SUCCESS;

    if (filtered)
    {
        cudaMemcpy(pResultData, pDestination, AllocationSizeInBytes, cudaMemcpyDeviceToHost);
    }

    // The original never released these buffers, leaking device memory on
    // every call.
    cudaFree(pDestination);
    cudaFree(pSource);
}

但 nppiFilterGaussAdvanced_8u_C1R 损坏图像

// Blurs a single-channel 8-bit image with nppiFilterGaussAdvanced_8u_C1R,
// using a Gaussian kernel generated by cv::getGaussianKernel.
//
// pSourceData   - buffer copied to the device as the input image
//                 (ImageLineStep * ImageHeight bytes).
// pResultData   - buffer that receives the filtered image copied back from the device.
// ImageLineStep - line step in bytes, used for both source and destination.
// ImageWidth    - ROI width.
// ImageHeight   - ROI height.
// Radius        - filter radius; the number of filter taps is derived from it.
//
// NOTE(review): pSource and pDestination are never passed to cudaFree, and no
// CUDA/NPP return code is checked in this function.
void cuda_npp_gauss_filter_qt_advanced(uchar* pSourceData, uchar* pResultData, const int &ImageLineStep, const int &ImageWidth, const int &ImageHeight, const int &Radius)
{
      NppiSize oSizeROI;

        oSizeROI.width = ImageWidth;
        oSizeROI.height = ImageHeight;

        // The same line step is used for the source and the destination image.
        Npp32s SourceStep = ImageLineStep;
        Npp32s DestinationStep = ImageLineStep;

        // NOTE(review): the product is computed in int before being widened to size_t.
        size_t AllocationSizeInBytes = ImageLineStep * ImageHeight;
        Npp8u *pSource, *pDestination;

        // Device buffers for the input and output images.
        cudaMalloc<Npp8u>(&pSource,AllocationSizeInBytes);
        cudaMalloc<Npp8u>(&pDestination,AllocationSizeInBytes);

        //-------------------------------------------------------

        // Radius is an int, so this evaluates to 2*Radius + 1 taps.
        int nFilterTaps = 2*((int)((float)ceil(Radius) + 0.5F)) + 1;

        //-------------------------------------------------------

        //-------------------------------------------------------
        //-------------- Gaussian kernel ------------------------

        // Sigma is derived from the tap count.
        double sigma = 0.3*((nFilterTaps-1)*0.5 - 1) + 0.8;

        // Kernel coefficients as 32-bit floats, stored in a cv::Mat.
        cv::Mat GaussianKernel = cv::getGaussianKernel(nFilterTaps, sigma, CV_32F);

        //-------------------------------------------------------
        //-------------------------------------------------------

        // Upload the input image.
        cudaMemcpy(pSource, pSourceData, AllocationSizeInBytes, cudaMemcpyHostToDevice);

        // NOTE(review): GaussianKernel.data is the cv::Mat's own buffer, not a
        // cudaMalloc'd one like pSource/pDestination - presumably host memory,
        // which would explain the corrupted output. Confirm that this NPP call
        // expects a device pointer for the kernel coefficients.
        nppiFilterGaussAdvanced_8u_C1R (pSource, SourceStep, pDestination, DestinationStep, oSizeROI, nFilterTaps, (Npp32f*)GaussianKernel.data);

        // Download the filtered image.
        cudaMemcpy(pResultData, pDestination, AllocationSizeInBytes, cudaMemcpyDeviceToHost);
    }

我使用 OpenCV 创建高斯内核。

Xubuntu 16.04.1、Qt 5.7-1、CUDA 8.0.44、OpenCV 2.4.9。谢谢。

4

2 回答 2

4

NPP 函数要求其指针参数指向在设备(GPU)上分配的内存,而 OpenCV 的 Mat(此处即 GaussianKernel)默认分配在主机内存中。

所以下面这行代码无法正常工作。

nppiFilterGaussAdvanced_8u_C1R (pSource, SourceStep, pDestination, DestinationStep, oSizeROI, nFilterTaps, (Npp32f*)GaussianKernel.data);

在传递给 NPP 函数之前,应先将 (Npp32f*)GaussianKernel.data 指向的核数据复制到设备上。可以这样实现:

// Device-side copy of the Gaussian kernel coefficients.
float* GaussianKernel_d = NULL;

// cv::Mat::step is a data member (bytes per matrix row), not a function, so
// it must be read without parentheses.
size_t GaussianKernelBytes = GaussianKernel.step * GaussianKernel.rows;

// Upload the coefficients so NPP receives a device pointer.
cudaMalloc<float>(&GaussianKernel_d, GaussianKernelBytes);
cudaMemcpy(GaussianKernel_d, GaussianKernel.data, GaussianKernelBytes, cudaMemcpyHostToDevice);

nppiFilterGaussAdvanced_8u_C1R (pSource, SourceStep, pDestination, DestinationStep, oSizeROI, nFilterTaps, GaussianKernel_d);

// Release the temporary kernel buffer; without this it leaks on every call.
cudaFree(GaussianKernel_d);
于 2016-12-05T06:58:59.310 回答
0

感谢帮助。现在可以了。

//-------------------------------------------------------
//-------------- Gaussian kernel ------------------------

   // Sigma is derived from the tap count.
   double sigma = 0.3*((nFilterTaps-1)*0.5 - 1) + 0.8;

   // Kernel coefficients as 32-bit floats, stored in a cv::Mat.
   cv::Mat GaussianKernel = cv::getGaussianKernel(nFilterTaps, sigma, CV_32F);

   // Device-side copy of the coefficients, so NPP receives a device pointer.
   Npp32f* pGaussianKernel = NULL;

   size_t GaussianKernelBytes = GaussianKernel.step * GaussianKernel.rows;

   cudaMalloc<Npp32f>(&pGaussianKernel, GaussianKernelBytes);

   cudaMemcpy(pGaussianKernel, GaussianKernel.data, GaussianKernelBytes, cudaMemcpyHostToDevice);

 //-------------------------------------------------------
 //-------------------------------------------------------

 // Upload the input image.
 cudaMemcpy(pSource, pSourceData, AllocationSizeInBytes, cudaMemcpyHostToDevice);

 nppiFilterGaussAdvanced_8u_C1R (pSource, SourceStep, pDestination, DestinationStep, oSizeROI, nFilterTaps, pGaussianKernel);

 // Download the filtered image.
 cudaMemcpy(pResultData, pDestination, AllocationSizeInBytes, cudaMemcpyDeviceToHost);

 // Release every device buffer once the result is back on the host;
 // otherwise each call leaks device memory.
 cudaFree(pGaussianKernel);
 cudaFree(pDestination);
 cudaFree(pSource);
于 2016-12-05T11:24:32.340 回答