0

I wrote a foreground extraction kernel to use in Matlab, the thing wasn't printing anything so I ported it to pure Cuda C and took most of the logic out. This thing isn't doing anything, not even printing the cuPrintf statements before the return, any idea why?

#include <cuda.h>
#include <stdio.h>      /* printf, scanf, NULL */
#include <stdlib.h>     /* calloc, exit, free */
#include "cuPrintf.cu"
#include "utils.h" 
#include <time.h>       /* clock_t, clock, CLOCKS_PER_SEC */



__global__ void foreground_extract(      unsigned char* inputImageRed,
                                         unsigned char* inputImageGreen,
                                         unsigned char* inputImageBlue,

                                         unsigned char* outputImageRed,
                                         unsigned char* outputImageGreen,
                                         unsigned char* outputImageBlue,                                          

                                         const int xDim, 
                                         const int yDim)
{


    cuPrintf("print something \n");
    //x = col, y = row
    //xDim = col_dim, yDim = row_dim
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y *blockDim.x *gridDim.x;

    int nnodes = xDim*yDim;
    if (offset >= nnodes) return;


    //test equality

    outputImageRed[offset] = inputImageRed[offset];
    outputImageGreen[offset] = inputImageGreen[offset];
    outputImageBlue[offset] = inputImageBlue[offset];

    cuPrintf("print something here too \n");
    cuPrintf("%d \n", outputImageRed[offset]);

}

int main()
{

        int xDim = 3;
        int yDim = 3;

                                         unsigned char* h_inputImageRed;
                                         unsigned char* h_inputImageGreen;
                                         unsigned char* h_inputImageBlue;

                                         unsigned char* h_outputImageRed;
                                         unsigned char* h_outputImageGreen;
                                         unsigned char* h_outputImageBlue;


                    h_inputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_inputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_inputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));

                    h_outputImageRed = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_outputImageGreen = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));
                    h_outputImageBlue = (unsigned char*) calloc ((xDim*yDim), sizeof(unsigned char));


      //initiate input only 
      unsigned char init =0;
      for (int i=0; i<(xDim*yDim);i++){

                                          h_inputImageRed[i] = init;
                                          h_inputImageGreen[i] = init;
                                          h_inputImageBlue[i] = init;

                                          init++;

                                          printf("%d\n", h_inputImageRed[i]);

      }

                                         //device arrays
                                         unsigned char* d_inputImageRed;
                                         unsigned char* d_inputImageGreen;
                                         unsigned char* d_inputImageBlue;

                                         unsigned char* d_outputImageRed;
                                         unsigned char* d_outputImageGreen;
                                         unsigned char* d_outputImageBlue;


     //cudaMallocs

     checkCudaErrors(cudaMalloc((void**)&d_inputImageRed, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_inputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_inputImageBlue, (sizeof(unsigned char)*xDim*yDim)));

     checkCudaErrors(cudaMalloc((void**)&d_outputImageRed, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim)));
     checkCudaErrors(cudaMalloc((void**)&d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim)));

     //cudaMemcpys, Host to Device

     checkCudaErrors(cudaMemcpy(d_inputImageRed, h_inputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_inputImageGreen, h_inputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_inputImageBlue, h_inputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));

     checkCudaErrors(cudaMemcpy(d_outputImageRed, h_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_outputImageGreen, h_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));
     checkCudaErrors(cudaMemcpy(d_outputImageBlue, h_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyHostToDevice));

     cudaPrintfInit();

     int gridSizeX = ceil(float(xDim/8));
     int gridSizeY = ceil(float(yDim/8));
     int gridSizeZ = 1;

     int blockSizeX=8;
     int blockSizeY=8;
     int blockSizeZ=1;

     const dim3 gridSize(gridSizeX,gridSizeY,gridSizeZ);
     const dim3 blockSize(blockSizeX,blockSizeY,blockSizeZ);

     foreground_extract <<< gridSize, blockSize >>>(d_inputImageRed,
                                                    d_inputImageGreen,
                                                    d_inputImageBlue,

                                                    d_outputImageRed,
                                                    d_outputImageGreen,
                                                    d_outputImageBlue,

                                                    xDim,yDim);


      cudaPrintfDisplay(stdout,true);
      cudaPrintfEnd();

      checkCudaErrors(cudaMemcpy(h_outputImageRed, d_outputImageRed, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
      checkCudaErrors(cudaMemcpy(h_outputImageGreen, d_outputImageGreen, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));
      checkCudaErrors(cudaMemcpy(h_outputImageBlue, d_outputImageBlue, (sizeof(unsigned char)*xDim*yDim), cudaMemcpyDeviceToHost));

      //free gpu data
     checkCudaErrors( cudaFree(d_outputImageRed) );
     checkCudaErrors( cudaFree(d_outputImageGreen) );
     checkCudaErrors( cudaFree(d_outputImageBlue) );
     checkCudaErrors( cudaFree(d_inputImageRed) );
     checkCudaErrors( cudaFree(d_inputImageGreen) );
     checkCudaErrors( cudaFree(d_inputImageBlue) );

     //free host data
     free(h_outputImageRed);
     free(h_outputImageGreen);
     free(h_outputImageBlue);
     free(h_inputImageRed);
     free(h_inputImageGreen);
     free(h_inputImageBlue);



      while(true){}
      return 0;
}
4

1 回答 1

3

Your kernel is not launching, which is why you get no output from the printf in the kernel. If you do proper cuda error checking on the kernel launch, you will discover this.

The error returned by your kernel launch is invalid configuration argument.

You are passing invalid values in gridSize.x and gridSize.y.

If you want to see what they are, print them out before calling your kernel. (A general debugging tip.)

Let's take a look at this line, since it's not doing what you think:

 int gridSizeX = ceil(float(xDim/8));
                              ^  ^
                              both values inside the parenthesis are *integers*

You haven't cast either of those values (xDim or 8) to a float. So the host compiler resolves the quantity inside the parenthesis using integer division. Integer division of 3/8 is zero. Nothing after that changes the value. Still zero.

于 2013-06-02T03:39:52.930 回答