1

我编写了一个向量加法内核,并分别在单个 GPU 和多个 GPU 上运行它。然而,在对相同长度的向量做加法时,多 GPU 版本比单 GPU 版本慢得多。

我的代码结构是:一个上下文、一个内核,以及与设备数量相同的多个命令队列。在多 GPU 的情况下,应该如何修改才能让它运行得更快?

代码如下

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <CL/cl.h>
#include <math.h>

//#define VECTOR_SIZE 640000
//#define LOCAL_SIZE 64

/*
 * Abort the program with a file/line diagnostic when an OpenCL call did not
 * return CL_SUCCESS.
 *
 * Wrapped in do { ... } while (0) so that `CHECK_ERROR(err);` behaves as a
 * single statement: it can be used inside an unbraced if/else without
 * capturing the following `else`.
 */
#define CHECK_ERROR(err) \
  do { \
    if ((err) != CL_SUCCESS) { \
      printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, (err)); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

/*
 * Current wall-clock time in seconds, as reported by gettimeofday(), with
 * the microsecond field folded in as the fractional part.
 */
double get_time() {
  struct timeval now;
  gettimeofday(&now, NULL);

  double whole_seconds = (double)now.tv_sec;
  double fraction = (double)1e-6 * now.tv_usec;
  return whole_seconds + fraction;
}

/*
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * file_name: path of the file to load.
 * len:       receives the number of bytes stored in the buffer, not counting
 *            the terminating '\0'.
 *
 * Returns the buffer; the caller owns it and must free() it. Any failure
 * (open, size query, allocation, read) prints a diagnostic and terminates
 * the program with EXIT_FAILURE.
 */
char *get_source_code(const char *file_name, size_t *len) {
  FILE *file = fopen(file_name, "r");
  if (file == NULL) {
    printf("[%s:%d] Failed to open %s\n", __FILE__, __LINE__, file_name);
    exit(EXIT_FAILURE);
  }

  // ftell() reports failure as -1; casting that to size_t would request a
  // gigantic allocation, so validate it while it is still signed.
  long file_size = -1;
  if (fseek(file, 0, SEEK_END) == 0) {
    file_size = ftell(file);
  }
  if (file_size < 0) {
    printf("[%s:%d] Failed to determine size of %s\n", __FILE__, __LINE__, file_name);
    fclose(file);
    exit(EXIT_FAILURE);
  }
  rewind(file);

  char *source_code = (char *)malloc((size_t)file_size + 1);
  if (source_code == NULL) {
    printf("[%s:%d] Failed to allocate memory for %s\n", __FILE__, __LINE__, file_name);
    fclose(file);
    exit(EXIT_FAILURE);
  }

  // Read byte-wise so the return value is the exact number of bytes stored.
  // In text mode this can be smaller than the ftell() size, so the buffer is
  // terminated (and *len reported) at what was actually read.
  size_t length = fread(source_code, 1, (size_t)file_size, file);
  if (ferror(file)) {
    printf("[%s:%d] Failed to read %s\n", __FILE__, __LINE__, file_name);
    free(source_code);
    fclose(file);
    exit(EXIT_FAILURE);
  }
  source_code[length] = '\0';

  fclose(file);

  *len = length;
  return source_code;
}

int main() {
  // OpenCl variables
  cl_platform_id platform;
  //cl_device_id device;
  cl_device_id *devices;
  cl_device_id device_temp;

  cl_context context;
  //cl_command_queue queue;
  cl_command_queue *queues;

  cl_mem bufferA, bufferB, bufferC;
  cl_program program;
  char *kernel_source;
  size_t kernel_source_size;
  
  cl_kernel kernel;
  //cl_kernel *kernels;

  cl_int err;

  //
  
  
  size_t VECTOR_SIZE = 64000000 ;
  int num_devices = 4;
  size_t LOCAL_SIZE = 64;
  
  // Time variables
  double start;
  double end;

  // Get platform
  err = clGetPlatformIDs(1, &platform, NULL);
  CHECK_ERROR(err);

  // Get GPU device
  
  devices = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL);
  //err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
  CHECK_ERROR(err);

  // Create context
  context = clCreateContext(NULL,num_devices, devices , NULL, NULL , &err);
  //context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  CHECK_ERROR(err);

  // Get kernel code
  kernel_source = get_source_code("kernel.cl", &kernel_source_size);

  // Create program
  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source,
    &kernel_source_size, &err);
  CHECK_ERROR(err);

  // Build program
  err = clBuildProgram(program, num_devices, devices, "", NULL, NULL);
  
  if(err == CL_BUILD_PROGRAM_FAILURE) {
    size_t log_size;
    char *log;

    // Get program build
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
    //  0, NULL, &log_size);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
    CHECK_ERROR(err);
    
    // Get build log
    log = (char*)malloc(log_size + 1);
    //err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 
    //  log_size, log, NULL);
    err = clGetProgramBuildInfo(program,devices[0],CL_PROGRAM_BUILD_LOG,log_size,log,NULL);
    CHECK_ERROR(err);

    log[log_size] = '\0';
    printf("Compiler error : \n%s\n", log);
    free(log);
    exit(0);
  }
  CHECK_ERROR(err);
  // Create Vector A, B, C
  float *A = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *B = (float*)malloc(sizeof(float) * VECTOR_SIZE);
  float *C = (float*)malloc(sizeof(float) * VECTOR_SIZE);

  // Initial Vector A, B
  //cl_ushort idx;
  /*for(idx = 0; idx < VECTOR_SIZE; idx++) {
    A[idx] = rand() % 100;
    B[idx] = rand() % 100;
  }*/
  printf("start\n");
  start = get_time();
  for(int i = 0; i <VECTOR_SIZE; i++){
      A[i] = sinf(i)*sinf(i);
      B[i] = cosf(i)*cosf(i);
  }
  end = get_time();
  printf("Initialization time : %f seconds elapsed\n", end-start);
  
  
  // Create kernel
  /*kernels = (cl_kernel *) malloc(sizeof(cl_kernel)*num_devices);
  for(int i=0; i<num_devices; i++){
      kernels[i] = clCreateKernel(program,"vec_add", &err);
      CHECK_ERROR(err);
  }*/
  kernel = clCreateKernel(program, "vec_add", &err);
  CHECK_ERROR(err);

  // Create Buffer
  bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);

  bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * VECTOR_SIZE, NULL, &err);
  CHECK_ERROR(err);
  
  printf("error hi\n");
  // Create command-queue
  queues = (cl_command_queue *) malloc(sizeof(cl_command_queue)*num_devices);
  for(int i=0; i<num_devices; i++){
      if (i==0){
          queues[i] = clCreateCommandQueue(context,devices[i],CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,&err);
          CHECK_ERROR(err);
      }
      else{
          queues[i] = clCreateCommandQueue(context,devices[i], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
          CHECK_ERROR(err);
      }
  }
  printf("error bye\n");
 
  //queue = clCreateCommandQueue(context, device, 0, &err);
  //CHECK_ERROR(err);

  // Write Buffer
  for (int i = 0; i<num_devices; i++){
      err = clEnqueueWriteBuffer(queues[i],bufferA,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,A,0,NULL,NULL);
      CHECK_ERROR(err);
      err = clEnqueueWriteBuffer(queues[i],bufferB,CL_FALSE,0,sizeof(float)*VECTOR_SIZE,B,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueWriteBuffer(queue, bufferA, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, A, 0, NULL, NULL);
  //CHECK_ERROR(err);
  //err = clEnqueueWriteBuffer(queue, bufferB, CL_FALSE, 0, sizeof(float) * VECTOR_SIZE, B, 0, NULL, NULL);
  //CHECK_ERROR(err);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }


  // Set Kernel arguments
  start = get_time();
  /*for(int i=0; i<num_devices; i++){
      err=clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &bufferA);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &bufferB);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &bufferC);
      CHECK_ERROR(err);
      err=clSetKernelArg(kernels[i], 3, sizeof(unsigned int), &VECTOR_SIZE);
      CHECK_ERROR(err);
  }*/
  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
  CHECK_ERROR(err);
  err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &VECTOR_SIZE);
  CHECK_ERROR(err);

  end = get_time();

  printf("Send Vector A, B to GPU : %f seconds elapsed\n", end - start);

  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  cl_event ooo_events[num_devices];
  start = get_time();
  // Execute Kernel
  size_t global_size = VECTOR_SIZE;
  size_t local_size = LOCAL_SIZE;
  for(int i=0; i<num_devices; i++){
      //start=get_time();
      
      err= clEnqueueNDRangeKernel(queues[i],kernel,1,NULL,&global_size,&local_size,0,NULL,NULL);
      CHECK_ERROR(err);
      //err = clEnqueueNDRangeKernel(queues[i],kernels[i],1,NULL,&global_size, &local_size,0,NULL,NULL);
      //CHECK_ERROR(err);
      //end=get_time();
      //printf("Calculate C : %f seconds elapsed\n", end-start);
  }
  //err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,&global_size, &local_size, 0, NULL, NULL);
  //CHECK_ERROR(err);
  for(int i=0; i<num_devices; i++){
      err=clFinish(queues[i]);
      CHECK_ERROR(err);
  }

  end = get_time();

  printf("Calculate C : %f seconds elapsed\n", end - start);

  // Read Buffer
  start = get_time();
  for(int i=0; i<num_devices; i++){
      err = clEnqueueReadBuffer(queues[i],bufferC,CL_TRUE,0,sizeof(float)*VECTOR_SIZE,C,0,NULL,NULL);
      CHECK_ERROR(err);
  }
  //err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, sizeof(float) * VECTOR_SIZE, C, 0, NULL, NULL);
  //CHECK_ERROR(err);

  end = get_time();
  printf("Receive C from GPU : %f seconds elapsed\n", end - start);

  // Evaluate Vector C
  start = get_time();
  double sum = 0;
  for(int i = 0; i < VECTOR_SIZE; i++) {
    sum += C[i];
  }
  end = get_time();
  printf("Verification time : %f seconds elapsed\n", end-start);
  printf("%lf, %ld \n", sum,VECTOR_SIZE);
  if (abs(VECTOR_SIZE - sum) < 1) {
    printf("Verification success!\n");
  }
  printf("Sum : %f\n", sum);

  // Release OpenCL object
  clReleaseMemObject(bufferA);
  clReleaseMemObject(bufferB);
  clReleaseMemObject(bufferC);
  free(A);
  free(B);
  free(C);
  clReleaseKernel(kernel);
  //clReleaseKernel(kernels[0]);
  //clReleaseKernel(kernels[1]);
  clReleaseProgram(program);
  
  clReleaseCommandQueue(queues[0]);
  clReleaseCommandQueue(queues[1]);
  //clReleaseCommandQueue(queue);
  clReleaseContext(context);

  return 0;
}
4

1 回答 1

1

仅当每个 GPU 执行的计算工作量比通信、调度和同步开销花费更多时间时,使用多个 GPU 在性能方面才有好处。对于单个 GPU 也是如此。

在您的情况下,每个 GPU 只执行一次简单的向量加法。但这种计算所花的时间,很少会超过把数据传输到 GPU、等待内核被实际调度执行等开销所花的时间。

您的代码测量的并不是内核的总执行时间,而是调度开销。

我建议您使用合适的 GPU 性能分析工具(取决于您的 GPU 厂商),而不是手动的 CPU 计时,来正确地分析实际发生的情况。您也可以尝试通过事件(event)来测量内核的执行时间。

于 2020-11-21T10:05:14.540 回答