c - C/CUDA - 修改 CUDA/GL 互操作示例以将图像存储在内存缓冲区中

Question

我正在尝试将由'CUDA-By Example'教科书中的CUDA-OpenGL互操作示例生成的图像存储到可以存储到图像的内存缓冲区中。

我想在内存缓冲区中存储两个图像，一个是绿色的“X”，另一个是橙色的“X”。当我用 OpenGL 渲染 pBuffer 时，我应该得到一个像示例输出一样的绿色“X”图像，但是，我只是得到一个黑屏。我不确定为什么我没有得到正确的输出。有人可以告诉我有什么问题吗？

我从A Memory buffer for multiple images获得了内存缓冲区的代码

    #include "book.h"
    #include "cpu_bitmap.h"
    #include "cuda.h"
    #include <cuda_gl_interop.h>

    PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
    PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
    PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
    PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

    #define     DIM    512

    #define IMAGESIZE_MAX (DIM*DIM)  // MY CHANGE

    GLuint  bufferObj;
    cudaGraphicsResource *resource;

    // based on ripple code, but uses uchar4 which is the type of data
    // graphic inter op uses. see screenshot - basic2.png

    __global__ void kernel( uchar4 *ptr1)   
    {
   // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr1[offset].x = 0;
    ptr1[offset].y = green;
    ptr1[offset].z = 0;
    ptr1[offset].w = 255;

    }

    // MY CODE

    __global__ void kernel2( uchar4 *ptr2) 
    {
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x ;

    // now calculate the value at that position
    float fx = x/(float)DIM - 0.5f;
    float fy = y/(float)DIM - 0.5f;
    unsigned char   green = 128 + 127 * tan( abs(fx*100) - abs(fy*100) );

    // accessing uchar4 vs unsigned char*
    ptr2[offset].x = 1000;
    ptr2[offset].y = green;
    ptr2[offset].z = 0;
    ptr2[offset].w = 255;
    }

    __global__ void copy ( uchar4 *pBuffer, uchar4 *Ptr, uchar4 *Ptr2, size_t size, int a )
    {

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = x + y * blockDim.x * gridDim.x ;
    int bdx = idx;

    if (a==1)
    {
    while ( idx < DIM*DIM)
    {
    pBuffer[idx] = Ptr[idx] ;
    __syncthreads();
    if (idx==DIM*DIM)
    {
     break;
    }
    }
    }


    if (a==2)
    {
    while ( (idx < DIM*DIM) && (bdx < DIM*DIM) )
    {
    uchar4 temp = Ptr2[bdx];
    __syncthreads();

    pBuffer[idx+4] = temp;
    __syncthreads();

    if ((idx==DIM*DIM) && (bdx==DIM*DIM))
    {
    break;
    }
    }  
    }

    }


    void key_func( unsigned char key, int x, int y ) 
    {
    switch (key) 
    {
    case 27:
        // clean up OpenGL and CUDA
        ( cudaGraphicsUnregisterResource( resource ) );
        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
        glDeleteBuffers( 1, &bufferObj );
        exit(0);
    }
    }

    void draw_func( void ) {
    // we pass zero as the last parameter, because out bufferObj is now
    // the source, and the field switches from being a pointer to a
    // bitmap to now mean an offset into a bitmap object
    glDrawPixels( DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
    glutSwapBuffers();
    }


    int main( int argc, char **argv ) 
    {
    cudaDeviceProp  prop;
    int dev;

    memset( &prop, 0, sizeof( cudaDeviceProp ) );
    prop.major = 1;
    prop.minor = 0;
    ( cudaChooseDevice( &dev, &prop ) );

    // tell CUDA which dev we will be using for graphic interop
    // from the programming guide:  Interoperability with OpenGL
    //     requires that the CUDA device be specified by
    //     cudaGLSetGLDevice() before any other runtime calls.

    ( cudaGLSetGLDevice( dev ) );

    // these GLUT calls need to be made before the other OpenGL
    // calls, else we get a seg fault
    glutInit( &argc, argv );
    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
    glutInitWindowSize( DIM, DIM );
    glutCreateWindow( "bitmap" );

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // the first three are standard OpenGL, the 4th is the CUDA reg 
    // of the bitmap these calls exist starting in OpenGL 1.5
    glGenBuffers( 1, &bufferObj );
    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
    glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * 4 ,NULL, GL_DYNAMIC_DRAW_ARB );

    // REGISTER THE GL BufferObj and CUDA Resource
    ( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );

    // do work with the memory dst being on the GPU, gotten via mapping
    HANDLE_ERROR( cudaGraphicsMapResources( 1, &resource, NULL ) );

    // MY MODIFIED CODE

    uchar4 *devPtr; 
    size_t size;
    size_t sizeTotal = 0;
    cudaMalloc ( (uchar4 **)&devPtr,  size);

    uchar4 *devPtr2; 

    cudaMalloc ( (uchar4 **)&devPtr2,  size); 

    uchar4 *pBuffer;

    (cudaMalloc ( (uchar4 **)&pBuffer,  size));

    uchar4 *pBufferCurrent;

    (cudaMalloc ( (uchar4 **)&pBufferCurrent,  size));

    uchar4 *pBufferImage;

    (cudaMalloc ( (uchar4 **)&pBufferImage,  size));

    // REGISTER THE C BUFFER and CUDA Resource

    HANDLE_ERROR( 
    cudaGraphicsResourceGetMappedPointer( (void**)&pBufferImage, 
                                          &size, 
                                          resource) );

    dim3    grids(DIM/16,DIM/16);
    dim3    threads(16,16);


    kernel<<<grids,threads>>>(devPtr);
    kernel2<<<grids,threads>>>(devPtr2);

    int a = 1;
    do 
    {

    if (a==1)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);  
    }

    if(a==2)
    {
    copy<<< grids, threads>>>(pBufferImage, devPtr, devPtr2, size, a);
    }

    a++;

    } while (a<=2); 

    cudaGraphicsUnmapResources( 1, &resource, NULL ) );

    // set up GLUT and kick off main loop
    glutKeyboardFunc( key_func );
    glutDisplayFunc( draw_func );
    glutMainLoop();
   }

score 2 · Accepted Answer

首先对所有 cuda API 调用（例如 cudaMemcpy 等）和内核调用进行适当的cuda 错误检查。

当你这样做时，你会发现你的内核没有成功运行。这些类型的东西不起作用：

uchar4 *devPtr;   // you've just created an unallocated NULL host pointer
size_t img1_size = IMAGESIZE_MAX;

kernel<<<grids,threads>>>(devPtr);  // this kernel will fail


uchar4 *devPtr2;   // you've just created an unallocated NULL host pointer
size_t img2_size = IMAGESIZE_MAX;

kernel2<<<grids,threads>>>(devPtr2);  // this kernel will fail

devPtr并且devPtr2在上面的代码中是 NULL 指针。您尚未分配与它们关联的任何存储空间。此外，由于您将它们传递给设备内核，因此需要为它们分配cudaMalloc或类似的 API 函数，以便指针在设备代码中可用。

由于它们不是用分配的cudaMalloc，所以一旦您尝试在设备代码中取消引用这些指针，您就会创建内核错误。如果您进行错误检查，这将很明显，因为您将收到“未指定的启动失败”或来自这些内核的类似报告。

我认为您的代码中可能还有许多其他问题，但首先您应该进行适当的 cuda 错误检查，至少让您的代码达到您所编写的所有内容实际上都在运行的程度。

而且您发布的代码实际上并没有编译。

修复编译错误后，我还发现您还有另一个无限循环：

cudaMalloc ( (uchar4 **)&pBufferCurrent,  sizeTotal + sizeof(size) + size); 
cudaMalloc ( (uchar4 **)&pBuffer,  sizeTotal + sizeof(size) + size);

do 
{

if (!pBufferCurrent)
{
break;
}

pBuffer = pBufferCurrent;

pBufferCurrent += sizeTotal;

imageget ( pBufferCurrent + sizeof(size), size, devPtr);

sizeTotal += (sizeof(size) + size);

} while (a==1);

由于a在循环中初始化为 1 ，并且循环中没有任何内容修改a，因此循环将永远不会根据while条件退出。由于 pBufferCurrent 如果由正确设置，它也永远不会为零cudaMalloc，因此break永远不会被采用。

如果你malloc或cudaMalloc一个名为的指针pBufferCurrent，我很难想象在什么情况下这会有意义：

pBufferCurrent += sizeTotal;

虽然这是合法的，但我不明白这有什么意义：

pBuffer = pBufferCurrent;

pBuffer您刚刚为using创建了一个分配cudaMalloc，但您要做的第一件事就是把它扔掉？

c - C/CUDA - 修改 CUDA/GL 互操作示例以将图像存储在内存缓冲区中

1 回答 1

Related

Reference