cuda - 使用 cuda driverAPI 函数执行矢量添加代码失败

Question

我在一台 GPU 设备上运行程序，驱动程序版本为 5.0，CUDA 版本为 5.0。当我完全使用驱动程序 API（driver API）函数来执行 vecAdd 时，遇到了错误。我的代码贴在下面。谁能帮我解决这个错误？

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "timer.h"

#include "vecAdd-kernel.ptx.h"
#include</opt/apps/cuda/5.0/include/cuda.h>
#include</opt/apps/cuda/5.0/include/cuda_runtime_api.h>

void compute_vec_add(int N, float *a, float* b, float *c);

int main(int argc,char** argv) {

   static CUcontext ctx;
  CUdevice dev;


  CUdeviceptr d_a, d_b, d_c;
  float *h_a, *h_b, *h_c, *h_temp;
  int i;
  int N = 1024 * 1024*8;

  struct stopwatch_t* timer = NULL;
  long double t_pcie_htd, t_pcie_dth, t_kernel, t_cpu;

  /* Setup timers */
  stopwatch_init ();
  timer = stopwatch_create ();

  /*
    Create the vectors
  */
  h_a = (float *) malloc(sizeof(float) * N);
  h_b = (float *) malloc(sizeof(float) * N);
  h_c = (float *) malloc(sizeof(float) * N);

  /*
    Set the initial values of h_a, h_b, and h_c
  */
  for (i=0; i < N; i++) {
    h_a[i] = (float) (rand() % 100) / 10.0;
    h_b[i] = (float) (rand() % 100) / 10.0;
    h_c[i] = (float) 0.0;
  }

  stopwatch_start (timer);
  CUmodule mod;
  CUfunction vecAddFunc;

     cuInit(0);

     cuDeviceGet(&dev, 0);
     cuCtxCreate(&ctx, 0, dev);

   cuModuleLoadData(&mod, (char *) imageBytes);
   cuModuleGetFunction(&vecAddFunc, mod, "vecAdd");

  size_t offset = 0;

   cuMemAlloc(&d_a,sizeof(float) * N); 
   cuMemAlloc(&d_b,sizeof(float) * N); 
   cuMemAlloc(&d_c,sizeof(float) * N); 

   cuMemcpyHtoD(d_a,h_a,sizeof(float) * N);
   cuMemcpyHtoD(d_b,h_b,sizeof(float) * N);
   cuMemcpyHtoD(d_c,h_c,sizeof(float) * N);

  cuParamSetv(vecAddFunc, offset, &d_a, sizeof(d_a));
  offset += sizeof(d_a);

  cuParamSetv(vecAddFunc, offset, &d_b, sizeof(d_b));
  offset += sizeof(d_b);

  cuParamSetv(vecAddFunc, offset, &d_c, sizeof(d_c));
  offset += sizeof(d_c);

  cuParamSetSize(vecAddFunc, offset);

  cuFuncSetBlockShape (vecAddFunc, 256, 1, 1);

  cuLaunchGrid(vecAddFunc, N/256, 1);
  cuStreamSynchronize(0);

  cuMemcpyDtoH(h_c,d_c,sizeof(float) * N);

  t_kernel = stopwatch_stop (timer);
  fprintf (stderr, "Time to execute GPU kernel: %Lg secs\n",t_kernel);


  /*
    Double check errors
  */
  h_temp = (float *) malloc(sizeof(float) * N);
  stopwatch_start (timer);
  compute_vec_add (N, h_a, h_b, h_temp);
  t_cpu = stopwatch_stop (timer);
  fprintf (stderr, "Time to execute CPU program: %Lg secs\n",
           t_cpu);

  int cnt = 0;
  for(int i = 0; i < N; i++) {
    if(abs(h_temp[i] - h_c[i]) > 1e-5) cnt++;
  }
  fprintf(stderr, "number of errors: %d out of %d\n", cnt, N);

  /*
    Free the host memory
  */
  free(h_a);
  free(h_b);
  free(h_c);


   cuMemFree(d_a);
   cuMemFree(d_b);
   cuMemFree(d_c);
   cuCtxDestroy(ctx);

  /*
    Free timer
  */
  stopwatch_destroy (timer);

  if(cnt == 0) {
    printf("\n\nSuccess\n");
  }
}

/*
 * CPU reference for the vector addition.
 *
 * Stores a[k] + b[k] into c[k] for every k in [0, N), in ascending
 * order. Nothing is written when N is zero or negative.
 */
void
compute_vec_add(int N, float *a, float* b, float *c) {
  int k = 0;

  while (k < N) {
    const float sum = a[k] + b[k];
    c[k] = sum;
    ++k;
  }
}

当我使用 cuda-gdb 查找错误时，输出为：

计时器：gettimeofday
计时器分辨率：~ 1 us (?)
[新线程 0x40200940 (LWP 32058)]
[在设备 0 上创建上下文 0x645a80]
警告：检测到 Cuda API 错误：cuModuleLoadData 返回 (0xd1)

警告：检测到 Cuda API 错误：cuModuleGetFunction 返回 (0x190)

警告：检测到 Cuda API 错误：cuParamSetv 返回 (0x190)

警告：检测到 Cuda API 错误：cuParamSetSize 返回 (0x190)

警告：检测到 Cuda API 错误：cuFuncSetBlockShape 返回 (0x190)

程序收到信号 SIGSEGV，分段错误。0x00002aaaaadab756 在?? () 来自 /usr/lib64/libcuda.so.1

谁能建议我为什么会遇到错误并帮助我解决它？

我正在使用的内核代码是

/*
 * **CUDA KERNEL**
 *
 * Compute the sum of two vectors
 *   C[i] = A[i] + B[i]
 *
 */



extern "C" {

  /*
   * Element-wise vector addition: each thread stores a[k] + b[k] into
   * c[k] for its own global index k. There is no bounds check on k.
   */
  __global__ void vecAdd(float* a, float* b, float* c)
  {
    /* Index of this block's first thread within the whole grid */
    const unsigned int block_base = blockIdx.x * blockDim.x;

    /* Global element index handled by this thread */
    const int idx = block_base + threadIdx.x;

    /* Add the two inputs and store the result */
    const float sum = a[idx] + b[idx];
    c[idx] = sum;
  }

}

我使用的编译命令是

# Include and library search paths for the CUDA toolkit, passed to nvcc
# when building the host program.
LDFLAGS = -I/usr/local/cuda/include \
          -L/usr/local/cuda/lib64 \


# Host program: compiles vecAdd-dummy.cu and timer.c with debug
# information (-g -G) and links against the driver library (-lcuda).
# NOTE(review): timer.o is listed as a prerequisite, but the recipe
# compiles timer.c directly -- confirm a rule for timer.o exists.
vecAdd-dummy: vecAdd-dummy.cu timer.o vecAdd-kernel.ptx.h
        nvcc -o vecAdd-dummy -arch=sm_20 vecAdd-dummy.cu timer.c ${LDFLAGS} -lcuda -g -G 

# Compile the kernel source to PTX for the sm_20 architecture.
vecAdd-kernel.ptx : vecAdd-kernel.cu
        nvcc -arch=sm_20 -ptx $^ -o $@

# Turn the PTX file into a C header containing a char array (bin2c).
vecAdd-kernel.ptx.h : vecAdd-kernel.ptx
        bin2c -t "char" $^ > $@

我使用的 GPU 设备是 Tesla M2090

score 2 · Accepted Answer

检测到 Cuda API 错误：cuModuleLoadData 返回 (0xd1)

/**
 * This indicates that there is no kernel image available that is suitable
 * for the device. This can occur when a user specifies code generation
 * options for a particular CUDA source file that do not include the
 * corresponding device configuration.
 */
CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,

其余错误是由于加载模块失败造成的。

函数 cuParamSet* 和 cuFuncSetBlockShape 在 CUDA 4.0 中已弃用。修复模块加载问题后，我建议您改用函数 cuLaunchKernel。

cuda - 使用 cuda driverAPI 函数执行矢量添加代码失败

1 回答 1

Related

Reference