0

nVidia CUDA 的 nppiResize_32f_C1R 在灰度图(1 x 32f)上工作正常,但 nppiResize_32f_C3R 返回的是垃圾数据。显然,一种变通方法是先把数据解交织为 R、G、B 三个平面,然后调用该例程 3 次,但我希望能够一次完成。nVidia 提供了很多单平面图像处理的示例代码,但针对交错(interleaved)彩色图像的示例很少,所以我来 stackoverflow 寻求帮助。我不确定该如何计算步幅(step),但据我理解,步幅等于图像宽度乘以每个像素的字节数。所以在我的情况下(行尾没有填充),对于 RGB,步幅应该是 宽度 × 3 × sizeof(32f)。

我在 cudaMemcpy2D() 中尝试了不同的步幅/间距(pitch),但没能为彩色 RGB 代码找到可行的方案。代码编译和运行都正常,没有报错。第一部分用于灰度图(工作正常),第二部分用于 RGB(输出垃圾数据)。

    // nppiResize using 2D aligned allocations

    #include <Exceptions.h>
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>

    // Runs a CUDA runtime call; if it does not yield cudaSuccess, prints the
    // error string with file and line to std::cerr and makes the *enclosing
    // function* return NULL. Because of that embedded return it can only be
    // used inside functions returning a pointer, and nothing allocated earlier
    // in that function is released on the error path.
    #define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)

    // Resizes a host float image on the GPU with NPP and returns the result in a
    // newly malloc'ed host buffer.
    //
    //   readbuff      host source pixels, rows stored back to back
    //   nSrcH, nSrcW  source height / width in pixels
    //   nDstH, nDstW  destination height / width in pixels
    //   byteperpixel  despite the name, used here as the number of float
    //                 channels per pixel: 1 selects the C1R path, 3 the C3R path
    //
    // Returns the malloc'ed result buffer, NULL when a CUDA call fails (via
    // CUDA_CALL), or 0 for any other byteperpixel value. Calls exit(1) if the
    // host result buffer cannot be allocated.
    //
    // NOTE(review): this is the version under discussion; the 3-channel branch
    // contains the defects annotated below.
    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep; 
            size_t  dstStep;

            // NOTE(review): NppiSize/NppiRect members are presumably int, so these
            // brace initialisers narrow from uint32_t - confirm against nppdefs.h.
            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            // Pitched device allocation: srcStep receives the row pitch in bytes.
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            // Host rows are tightly packed, so host pitch == copied width.
            // The (void**) casts on the pointer arguments are not needed.
            CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};     
            NppiRect oDstROI = {0, 0, nDstW, nDstH}; 
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            // A failed resize is only logged; execution continues and the
            // device output is still copied back and returned.
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            // Copy the pitched device rows into the tightly packed host buffer.
            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            // If either cudaFree fails, CUDA_CALL returns NULL and hostDst is leaked.
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep; 
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
// Each device row is allocated for 3 floats per pixel.
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
// NOTE(review): the host pitch is 3 * nSrcW floats, but the copied width is
// only nSrcW floats, so just the first third of every source row is uploaded;
// the remainder of each device row is left uninitialised.
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH}; 
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

// NOTE(review): the doubled opening parenthesis below is unbalanced, so this
// statement does not compile as written.
NppStatus result = nppiResize_32f_C3R((devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            // NOTE(review): hostDst holds 3 * nDstW floats per row, but both the
            // host pitch and the copied width are nDstW floats, so only a third
            // of the buffer is written and its rows are packed at the wrong pitch.
            CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        // Any other channel count is unsupported.
        return(0);
    }
4

1 回答 1

1

您对 cudaMemcpy2D 的调用中有多处错误(3 通道代码中的两处调用都有问题)。下面这段代码在我这里可以正常工作:

$ cat t1521.cu
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>
    #include <iostream>
    #include <stdint.h>
    #include <stdio.h>
    // Evaluates a CUDA runtime call once. On any status other than cudaSuccess
    // it writes the error string plus file and line to std::cerr and makes the
    // enclosing function return NULL, so it is only usable inside functions
    // that return a pointer.
    #define CUDA_CALL(call)                                                     \
        do {                                                                    \
            cudaError_t cuda_status_ = call;                                    \
            if (cuda_status_ != cudaSuccess) {                                  \
                std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_status_) \
                          << ", " << __FILE__ << ", line " << __LINE__          \
                          << std::endl;                                         \
                return(NULL);                                                   \
            }                                                                   \
        } while(0)
    using namespace std;
    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep;
            size_t  dstStep;

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep;
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        return(0);
    }

// Self-test: builds a constant-colour 640x480 interleaved image (every pixel is
// 1, 2, 3), halves it with decimate_cuda and checks that every output pixel is
// still 1, 2, 3. Exits with 0 on success and 1 on any failure.
int main(){
    const uint32_t nSrcH = 480;
    const uint32_t nSrcW = 640;
    const uint8_t byteperpixel = 3;
    const size_t srcPixels = (size_t)nSrcW * nSrcH;

    float *readbuff = (float *)malloc(srcPixels * byteperpixel * sizeof(float));
    if (readbuff == NULL) {
        std::cerr << "unable to allocate source buffer" << std::endl;
        return 1;
    }
    // Channel c of every pixel holds c + 1.
    for (size_t i = 0; i < srcPixels; i++) {
        for (size_t c = 0; c < byteperpixel; c++) {
            readbuff[i*byteperpixel + c] = c + 1.0f;
        }
    }

    const uint32_t nDstW = nSrcW/2;
    const uint32_t nDstH = nSrcH/2;
    float *res =  decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);
    if (res == NULL) {
        // decimate_cuda returns NULL on failure; do not dereference it.
        std::cerr << "decimate_cuda failed" << std::endl;
        free(readbuff);
        return 1;
    }

    int status = 0;
    const size_t dstCount = (size_t)nDstW * nDstH * byteperpixel;
    for (size_t i = 0; i < dstCount; i++) {
        // Compare with a small tolerance rather than bit-exact float equality.
        float diff = res[i] - ((i % byteperpixel) + 1.0f);
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-5f) {
            std::cout << "error at: " << i << std::endl;
            status = 1;
            break;
        }
    }

    free(res);
    free(readbuff);
    return status;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

将来,如果您能像我在回答中那样提供完整的代码,会方便很多。事实上,SO 对此有要求,请参见此处的第 1 项。

顺便说一句,对于任何现代 GPU 和 CUDA 版本,在设备上使用带间距的(pitched)分配——它在这里引入了您没能处理好的复杂性——对正确性和性能来说其实都没有必要。普通的线性/平面分配(间距 == 宽度)应该就可以了。

于 2019-09-30T04:58:13.820 回答