我在 Visual Studio 2010 中构建了一个项目，该项目包含一个 mexFunction 和一个用于调用 CUDA 内核函数的包装函数。我的问题是：当我在包装函数中尝试读取已拷贝到设备端的数据（d_B）时，程序崩溃了。下面贴出了部分代码，并在问题发生的确切位置添加了注释。
// MEX gateway function.
// Validates that exactly two real, dense, single-precision inputs with the
// same number of elements were passed, creates a rows-by-1 single-precision
// output, and hands the raw host data pointers to myGPU::cudaFunction_wrapper.
//   nlhs / plhs : number of / array of output arguments (at most one allowed)
//   nrhs / prhs : number of / array of input arguments (exactly two required)
// Any validation failure is reported through mexErrMsgTxt.
// NOTE(review): the closing brace of this function is not part of the excerpt.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
// input validation
if (nrhs != 2 || nlhs > 1) {
mexErrMsgTxt("Wrong number of input/output arguments.");
}
if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1])) {
mexErrMsgTxt("Inputs must be single arrays.");
}
if (mxIsComplex(prhs[0]) || mxIsComplex(prhs[1])) {
mexErrMsgTxt("Inputs must be real arrays.");
}
if (mxIsSparse(prhs[0]) || mxIsSparse(prhs[1])) {
mexErrMsgTxt("Inputs must be dense arrays.");
}
// Only the element counts are compared; the individual dimensions of the two
// inputs are not checked against each other.
if (mxGetNumberOfElements(prhs[0]) != mxGetNumberOfElements(prhs[1])) {
mexErrMsgTxt("Inputs must have the same size.");
}
// create output array
// NOTE(review): numel, ndims and dims are not used in the visible part of
// this function.
mwSize numel = mxGetNumberOfElements(prhs[0]);
mwSize ndims = mxGetNumberOfDimensions(prhs[0]);
const mwSize *dims = mxGetDimensions(prhs[0]);
// NOTE(review): the results of mxGetM/mxGetN are narrowed to int here because
// the wrapper takes int sizes; presumably they can exceed INT_MAX for very
// large arrays - confirm or add a range check.
int rows = mxGetM(prhs[0]); /* Get the dimensions of A */
int cols = mxGetN(prhs[0]);
//plhs[0] = mxCreateNumericArray(1, dims, mxSINGLE_CLASS, mxREAL);
//plhs[0] = mxCreateDoubleMatrix(rows,1,mxREAL);
// Create a rows-by-1 real single-precision matrix for the result
plhs[0] = mxCreateNumericMatrix(rows, 1, mxSINGLE_CLASS, mxREAL);
// get pointers to data
// The float casts rely on the mxIsSingle checks above (inputs) and on the
// mxSINGLE_CLASS output created just before.
float *h_c = (float*) mxGetData(plhs[0]);
float *h_a = (float*) mxGetData(prhs[0]);
float *h_b = (float*) mxGetData(prhs[1]);
// All three pointers refer to host memory; the wrapper is responsible for
// moving the data to and from the GPU.
myGPU::cudaFunction_wrapper(h_a, h_b, h_c, rows, cols);
以下是 .cu 文件中的代码：
namespace myGPU
{//begin namespace
extern "C++" void cudaFunction_wrapper( float* h_A, float* h_B, float* h_C, int rows, int cols );

/*
 * Kernel: for every row m in [0, rows) writes
 *     C[m] = A[m + rows*(cols-1)]
 * i.e. the element of the last column of A, where A is indexed as
 * A[m + rows*n]. This is exactly the net result of the previous version, in
 * which every thread executed the complete rows x cols double loop and
 * repeatedly overwrote C[m].
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. Rows are distributed
 * over the threads with a grid-stride loop, so each C[m] is written by
 * exactly one thread and the kernel is correct for any grid size.
 * No shared memory is used.
 *
 * A : device pointer, at least rows*cols floats (read)
 * B : device pointer, currently not read by the kernel
 * C : device pointer, at least rows floats (written)
 */
__global__ void cudaFunction( float* A, float* B, float* C, int rows, int cols )
{
    // Nothing to copy for an empty matrix (the original loops would not have
    // executed either).
    if (rows <= 0 || cols <= 0)
        return;

    // Index arithmetic in size_t so rows*(cols-1) cannot overflow int.
    const size_t rowCount   = (size_t)rows;
    const size_t lastColumn = rowCount * (size_t)(cols - 1);
    const size_t stride     = (size_t)gridDim.x * blockDim.x;

    for (size_t m = (size_t)blockIdx.x * blockDim.x + threadIdx.x; m < rowCount; m += stride)
        C[m] = A[lastColumn + m];
}

/* Prints "<what> (error code <reason>)!" to stderr when err is a failure; returns true on success. */
static bool cudaCallSucceeded( cudaError_t err, const char* what )
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Host wrapper: copies the rows x cols inputs h_A and h_B to the device, runs
 * cudaFunction, and copies the rows-element result back into h_C.
 *
 * h_A, h_B : host pointers, rows*cols floats each
 * h_C      : host pointer, rows floats, receives the result
 * rows/cols: matrix dimensions; if either is <= 0 the function returns
 *            immediately and h_C is left untouched.
 *
 * Error handling: this code is called from a MEX function, i.e. it runs
 * inside the host application's process. On a CUDA failure it therefore
 * reports the error on stderr, releases all device buffers and returns
 * (h_C is then not written) instead of calling exit(), which would terminate
 * the whole host process. For the same reason cudaDeviceReset() is not
 * called: it destroys the CUDA context of the entire process.
 */
void cudaFunction_wrapper( float* h_A, float* h_B, float* h_C, int rows, int cols )
{
    // An empty input would otherwise lead to a launch with zero blocks,
    // which is an invalid configuration.
    if (rows <= 0 || cols <= 0)
        return;

    // Byte sizes computed in size_t so rows*cols cannot overflow int.
    const size_t inputBytes  = (size_t)rows * (size_t)cols * sizeof(float);
    const size_t outputBytes = (size_t)rows * sizeof(float);

    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    // Allocate the device input matrices A and B and the output vector C.
    // Each step only runs if all previous steps succeeded.
    bool ok = cudaCallSucceeded(cudaMalloc((void **)&d_A, inputBytes),
                                "Failed to allocate device vector A");
    ok = ok && cudaCallSucceeded(cudaMalloc((void **)&d_B, inputBytes),
                                 "Failed to allocate device vector B");
    ok = ok && cudaCallSucceeded(cudaMalloc((void **)&d_C, outputBytes),
                                 "Failed to allocate device vector C");

    if (ok)
    {
        printf("Copy input data from the host memory to the CUDA device\n");
        ok = cudaCallSucceeded(cudaMemcpy(d_A, h_A, inputBytes, cudaMemcpyHostToDevice),
                               "Failed to copy vector A from host to device");
    }
    ok = ok && cudaCallSucceeded(cudaMemcpy(d_B, h_B, inputBytes, cudaMemcpyHostToDevice),
                                 "Failed to copy vector B from host to device");

    // NOTE: d_A, d_B and d_C point to DEVICE memory. They must never be
    // dereferenced in host code; doing so is what crashes in the second loop
    // of the debug code below. To inspect device data, first cudaMemcpy it
    // back into a host buffer (cudaMemcpyDeviceToHost) and print that buffer.
    /*
    /////////////////////////////////////////////////////
    ///////////// Works WHEN ACCESS h_A /////////////////
    /////////////////////////////////////////////////////
    int m,n;
    for(m = 0; m < rows; m++)
    for(n = 0; n < cols; n++)
    mexPrintf("%f \n", h_A[m + rows*n]) ;
    /////////////////////////////////////////////////////
    ///////////// IT CRASHES HERE WHEN ACCESS d_B ///////
    /////////////////////////////////////////////////////
    for(m = 0; m < rows; m++)
    for(n = 0; n < cols; n++)
    mexPrintf("%f \n", d_B[m + rows*n]) ;
    */

    if (ok)
    {
        // One thread per output row. The grid is capped at 65535 blocks (the
        // x-dimension limit of older devices); the kernel's grid-stride loop
        // covers any remaining rows.
        const int threadsPerBlock = 256;
        int blocksPerGrid = (rows - 1) / threadsPerBlock + 1;
        if (blocksPerGrid > 65535)
            blocksPerGrid = 65535;
        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
        cudaFunction<<<blocksPerGrid, threadsPerBlock>>>( d_A, d_B, d_C, rows, cols );
        ok = cudaCallSucceeded(cudaGetLastError(), "Failed to launch vectorAdd kernel");
    }

    if (ok)
    {
        // Blocking copy: also waits for the kernel and surfaces any error
        // that occurred while it was executing.
        printf("Copy output data from the CUDA device to the host memory\n");
        ok = cudaCallSucceeded(cudaMemcpy(h_C, d_C, outputBytes, cudaMemcpyDeviceToHost),
                               "Failed to copy vector C from device to host");
    }

    // Free device global memory on every path. cudaFree(NULL) is a no-op, so
    // buffers that were never allocated are handled as well.
    cudaCallSucceeded(cudaFree(d_A), "Failed to free device vector A");
    cudaCallSucceeded(cudaFree(d_B), "Failed to free device vector B");
    cudaCallSucceeded(cudaFree(d_C), "Failed to free device vector C");
}
}//end namespace
我想问的第二个问题是：如何在 VS 中通过“附加到进程”的方式使用 Nsight 进行调试？我按照 http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Attach_CUDA_to_Process.htm 中的说明进行了操作，但“附加”按钮始终无法启用。顺便说一下，我想附加到的程序是 Matlab。
提前感谢您。
PS:Win 7 84x,CUDA SDK 5.5,Visual Studio 2010,Matlab 2011a