multithreading - CUDA 帮助实现具有 320*240 图像处理的内核函数

Question

我对 cuda 很陌生，正在尝试通过此代码读取 cuda 中的帧，但我遇到了一个推翻错误并得到黑色输出..这是我的代码..

BYTE *imageBuf = 0;
BYTE *maBuf = 0;
BYTE *fgBuf = 0;
BYTE *tempBuf = 0;

cudaMalloc((void **)&maBuf , m_imgWidth*m_imgHeight);
cudaMalloc((void **)&fgBuf , m_imgWidth*m_imgHeight);
cudaMalloc((void **)&imageBuf , m_imgWidth*m_imgHeight);
cudaMalloc((void **)&tempBuf , m_imgWidth*m_imgHeight);
cudaMalloc((void **)&m_pixel_ptr , m_imgWidth*m_imgHeight);

cudaMemcpy(m_pixel_ptr , m_pixelParam , m_imgWidth*m_imgHeight , cudaMemcpyHostToDevice);
cudaMemcpy(imageBuf , inImgBuf , m_imgWidth*m_imgHeight , cudaMemcpyHostToDevice);
cudaMemcpy(fgBuf , foregroundBUf , m_imgWidth*m_imgHeight , cudaMemcpyHostToDevice);
cudaMemcpy(maBuf , maskBuf , m_imgWidth*m_imgHeight , cudaMemcpyHostToDevice);
cudaMemcpy(tempBuf , foregroundBUf , m_imgWidth*m_imgHeight , cudaMemcpyHostToDevice);

kernel<<<16,20>>>(imageBuf, maBuf, fgBuf , 320 , 240 , m_pixel_ptr , tempBuf , 0);


cudaMemcpy (maskBuf, maBuf , m_imgWidth*m_imgHeight , cudaMemcpyDeviceToHost);
cudaMemcpy (foregroundBUf, fgBuf , m_imgWidth*m_imgHeight , cudaMemcpyDeviceToHost);
cudaMemcpy (inImgBuf, imageBuf , m_imgWidth*m_imgHeight , cudaMemcpyDeviceToHost);
cudaMemcpy (m_pixelParam , m_pixel_ptr , m_imgWidth*m_imgHeight , cudaMemcpyDeviceToHost);

cudaFree(m_pixel_ptr);
cudaFree(imageBuf);
cudaFree(maBuf);
cudaFree(tempBuf);
cudaFree(fgBuf);

这就是我调用内核函数的方式，这是我的内核函数

    __global__ void kernel(BYTE *inImgBuf, BYTE *maskBuf,  BYTE *foregroundBUf , int width , int height , PixelPara *m_pixelParam , BYTE *tmpBuffer , int j)
{


int m_IniWeight = 0.005;
int m_IniStd = 400.0;
int m_Threshold = 0.7;
int fgTh;

int thresholdRank;


for(int i = 0; i < width; i++)
{
        int tid = ((threadIdx.x + blockIdx.x * blockDim.x)*width + i);
        if (m_pixelParam[tid].m_validModels == 0) 
        {
            m_pixelParam[j*width+i].m_weight[0] = m_IniWeight;
            m_pixelParam[j*width+i].m_stdVar[0] = m_IniStd;
            m_pixelParam[j*width+i].m_priRank[0] = 0;
            m_pixelParam[j*width+i].m_mean[0] = (double)inImgBuf[j*width+i];
            m_pixelParam[j*width+i].m_validModels++;


        }
        else
        {
            thresholdRank = 0;
            fgTh = 0.0;             
            while (true)
            {
                fgTh += m_pixelParam[j*width+i].m_weight[m_pixelParam[j*width+i].m_priRank[thresholdRank]];                 

                if ((fgTh > m_Threshold) || (thresholdRank >= m_pixelParam[j*width+i].m_validModels - 1)) 
                    break;      

                thresholdRank++;
            }

                gmmImplementation (inImgBuf, maskBuf, foregroundBUf ,width , height , m_pixelParam , (j*width+i) ,tmpBuffer ,thresholdRank );




            foregroundBUf = tmpBuffer;


        }

}






}

任何人都可以在这里帮助我如何在 cuda 中处理 320 * 240 帧我正在尝试在 cuda 上实现 GMM 但失败了..任何帮助或指导..谢谢。

score 1 · Accepted Answer

问题似乎是您正在分配 m_pixel_ptr 的大小m_imgWidth*m_imgHeight，而它看起来应该是m_imgWidth*m_imgHeight * sizeof(PixelPara). 这可以解释为什么你会遇到缓冲区溢出。您可能正在尝试读取和写入超出数组末尾的方式。当然，您对 m_pixel_ptr 的 cudaMemcpy 调用也需要长度m_imgWidth*m_imgHeight * sizeof(PixelParam)，以便将整个缓冲区复制到设备和从设备复制。

另外， imageBuf 应该包含双精度字节还是仅包含单个字节？现在，您的代码只需从每个像素读取一个字节，inImgBuf然后将其转换为双精度，然后再将其存储到m_pixelParam[j*width+i].m_mean[0]. 如果它应该读取双精度，则需要分配和复制具有大小的 imageBuf，m_imgWidth*m_imgHeight * sizeof(double)并且需要在内核参数列表中声明inImgBuf为 a而不是 a 。double *BYTE *

multithreading - CUDA 帮助实现具有 320*240 图像处理的内核函数

1 回答 1

Related

Reference