2

研究来自 Fixstars 的示例和源代码。具体来说,我正在尝试第 5 章中的最后一段代码(两条移动平均线 - 又名金十字):

http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/opencl-programming-practice/

该代码可在此处获得:

http://www.fixstars.com/en/opencl/book/sample/

我将在下面发布具体示例。但缺点是通过如下设置命令队列:

command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

导致无效的 command_queue (clError)。因此,返回数据没有得到正确处理。即,全为零。

但是,如果我将代码设置为仅计算一个移动平均线,而没有CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,那么一切正常。相应地,本章之前的 Fixstars 代码似乎都运行良好。

我正在开发具有 NVIDIA 芯片的全新 MacBook Pro(视网膜)。所以我想知道这是否是 NVIDIA 的实施或其他方面的问题。

无论如何,这对我来说是一个阻碍,因为我最终试图做的是启动多个(相同的)进程,每个进程都有不同的参数,基本上与这个例子试图通过同时计算两个股价移动平均线来做同样的事情。

下面的代码片段。我在他们的源代码中添加了一个调试打印输出功能。因此,当我的程序执行时,我看到以下内容:

从 clGetPlatformIDs 返回代码:成功!
从 clGetDeviceIDs 返回代码:成功!
从 clCreateContext 返回代码:成功!
从 clCreateCommandQueue 返回代码:无效值
从 clBuildProgram 返回代码:成功!
clCreateKernel(13) 的返回代码:成功!
clCreateKernel(26) 的返回代码:成功!
clEnqueueTask(13) 的返回代码:无效的命令队列
clEnqueueTask(26) 的返回代码:无效的命令队列
结果[25]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
结果[26]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
结果[27]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
结果[28]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)

...其余数据也全为零。我正在编译:

gcc -O2 -c moving_average_vec4p.c<br>
gcc moving_average_vec4p.o -o moving_average_vec4p -framework opencl

----(主机代码)moving_average_vec4p.c ----

#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include <stdio.h>

#define NAME_NUM (4) /* Number of stocks */
#define DATA_NUM (100) /* Number of data to process for each stock */

/* Read Stock data */
int stock_array_4[NAME_NUM*DATA_NUM]= {
#include "stock_array_4.txt"
};

/* Moving average width */
#define WINDOW_SIZE_13 (13)
#define WINDOW_SIZE_26 (26)

#define MAX_SOURCE_SIZE (0x100000)

/* DT: added to aid in debugging */

void printCLError (int err) {
  switch (err) {
  case CL_SUCCESS:                            printf("Success!\n"); break;
  case CL_DEVICE_NOT_FOUND:                   printf("Device not found.\n"); break;
  case CL_DEVICE_NOT_AVAILABLE:               printf("Device not available\n"); break;
  case CL_COMPILER_NOT_AVAILABLE:             printf("Compiler not available\n"); break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:      printf("Memory object allocation failure\n"); break;
  case CL_OUT_OF_RESOURCES:                   printf("Out of resources\n"); break;
  case CL_OUT_OF_HOST_MEMORY:                 printf("Out of host memory\n"); break;
  case CL_PROFILING_INFO_NOT_AVAILABLE:       printf("Profiling information not available\n"); break;
  case CL_MEM_COPY_OVERLAP:                   printf("Memory copy overlap\n"); break;
  case CL_IMAGE_FORMAT_MISMATCH:              printf("Image format mismatch\n"); break;
  case CL_IMAGE_FORMAT_NOT_SUPPORTED:         printf("Image format not supported\n"); break;
  case CL_BUILD_PROGRAM_FAILURE:              printf("Program build failure\n"); break;
  case CL_MAP_FAILURE:                        printf("Map failure\n"); break;
  case CL_INVALID_VALUE:                      printf("Invalid value\n"); break;
  case CL_INVALID_DEVICE_TYPE:                printf("Invalid device type\n"); break;
  case CL_INVALID_PLATFORM:                   printf("Invalid platform\n"); break;
  case CL_INVALID_DEVICE:                     printf("Invalid device\n"); break;
  case CL_INVALID_CONTEXT:                    printf("Invalid context\n"); break;
  case CL_INVALID_QUEUE_PROPERTIES:           printf("Invalid queue properties\n"); break;
  case CL_INVALID_COMMAND_QUEUE:              printf("Invalid command queue\n"); break;
  case CL_INVALID_HOST_PTR:                   printf("Invalid host pointer\n"); break;
  case CL_INVALID_MEM_OBJECT:                 printf("Invalid memory object\n"); break;
  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    printf("Invalid image format descriptor\n"); break;
  case CL_INVALID_IMAGE_SIZE:                 printf("Invalid image size\n"); break;
  case CL_INVALID_SAMPLER:                    printf("Invalid sampler\n"); break;
  case CL_INVALID_BINARY:                     printf("Invalid binary\n"); break;
  case CL_INVALID_BUILD_OPTIONS:              printf("Invalid build options\n"); break;
  case CL_INVALID_PROGRAM:                    printf("Invalid program\n"); break;
  case CL_INVALID_PROGRAM_EXECUTABLE:         printf("Invalid program executable\n"); break;
  case CL_INVALID_KERNEL_NAME:                printf("Invalid kernel name\n"); break;
  case CL_INVALID_KERNEL_DEFINITION:          printf("Invalid kernel definition\n"); break;
  case CL_INVALID_KERNEL:                     printf("Invalid kernel\n"); break;
  case CL_INVALID_ARG_INDEX:                  printf("Invalid argument index\n"); break;
  case CL_INVALID_ARG_VALUE:                  printf("Invalid argument value\n"); break;
  case CL_INVALID_ARG_SIZE:                   printf("Invalid argument size\n"); break;
  case CL_INVALID_KERNEL_ARGS:                printf("Invalid kernel arguments\n"); break;
  case CL_INVALID_WORK_DIMENSION:             printf("Invalid work dimension\n"); break;
  case CL_INVALID_WORK_GROUP_SIZE:            printf("Invalid work group size\n"); break;
  case CL_INVALID_WORK_ITEM_SIZE:             printf("Invalid work item size\n"); break;
  case CL_INVALID_GLOBAL_OFFSET:              printf("Invalid global offset\n"); break;
  case CL_INVALID_EVENT_WAIT_LIST:            printf("Invalid event wait list\n"); break;
  case CL_INVALID_EVENT:                      printf("Invalid event\n"); break;
  case CL_INVALID_OPERATION:                  printf("Invalid operation\n"); break;
  case CL_INVALID_GL_OBJECT:                  printf("Invalid OpenGL object\n"); break;
  case CL_INVALID_BUFFER_SIZE:                printf("Invalid buffer size\n"); break;
  case CL_INVALID_MIP_LEVEL:                  printf("Invalid mip-map level\n"); break;
  default: printf("Unknown\n");
  }
}


int main(void)
{
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_platforms;
  cl_device_id device_id = NULL;
  cl_uint ret_num_devices;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobj_in = NULL;
  cl_mem memobj_out13 = NULL;
  cl_mem memobj_out26 = NULL;
  cl_program program = NULL;
  cl_kernel kernel13 = NULL;
  cl_kernel kernel26 = NULL;
  cl_event event13, event26;
  size_t kernel_code_size;
  char *kernel_src_str;
  float *result13;
  float *result26;
  cl_int ret;
  FILE *fp;

  int window_num_13 = (int)WINDOW_SIZE_13;
  int window_num_26 = (int)WINDOW_SIZE_26;
  int point_num = (NAME_NUM * DATA_NUM);
  int data_num = (int)DATA_NUM;
  int name_num = (int)NAME_NUM;

  int i, j;
  /* Allocate space to read in kernel code */
  kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE);

  /* Allocate space for the result on the host side */
  result13 = (float *)malloc(point_num*sizeof(float)); /* average over13 weeks */
  result26 = (float *)malloc(point_num*sizeof(float)); /* average over26 weeks */

  /* Get Platform */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  printf("Return code from clGetPlatformIDs: ");
  printCLError(ret);

  /* Get Device */
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
               &ret_num_devices);
  printf("Return code from clGetDeviceIDs: ");
  printCLError(ret);

  /* Create Context */
  context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  printf("Return code from clCreateContext: ");
  printCLError(ret);

  /* Create Command Queue */

  // DT:  this seems to break it (ie., output is all zeros)
  command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
  printf("Return code from clCreateCommandQueue: ");
  printCLError(ret);

  /* Read kernel source code */
  fp = fopen("moving_average_vec4.cl", "r");
  kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  /* Create Program Object */
  program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
                      (const size_t *)&kernel_code_size, &ret);

  /* Compile kernel */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  printf("Return code from clBuildProgram: ");
  printCLError(ret);

  /* Create kernel */
  kernel13 = clCreateKernel(program, "moving_average_vec4", &ret); /* 13 weeks */
  printf("Return code from clCreateKernel(13): ");
  printCLError(ret);

  kernel26 = clCreateKernel(program, "moving_average_vec4", &ret); /* 26 weeks */
  printf("Return code from clCreateKernel(26): ");
  printCLError(ret);

  /* Create buffer for the input data on the device */
  memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE,
                 point_num * sizeof(int), NULL, &ret);

  /* Create buffer for the result on the device */
  memobj_out13 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 13 weeks */
  memobj_out26 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 26 weeks */

  /* Copy input data to the global memory on the device*/
  ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, point_num * sizeof(int), stock_array_4, 0, NULL, NULL);

  /* Set Kernel Arguments (13 weeks) */
  ret = clSetKernelArg(kernel13, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel13, 1, sizeof(cl_mem), (void *)&memobj_out13);
  ret = clSetKernelArg(kernel13, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel13, 3, sizeof(int), (void *)&window_num_13);

  /* Submit task to compute the moving average over 13 weeks */

  ret = clEnqueueTask(command_queue, kernel13, 0, NULL, NULL);
  printf("Return code from clEnqueueTask(13): ");
  printCLError(ret);

  /* Set Kernel Arguments (26 weeks) */
  ret = clSetKernelArg(kernel26, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel26, 1, sizeof(cl_mem), (void *)&memobj_out26);
  ret = clSetKernelArg(kernel26, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel26, 3, sizeof(int), (void *)&window_num_26);

  /* Submit task to compute the moving average over 26 weeks */
  ret = clEnqueueTask(command_queue, kernel26, 0, NULL, &event26);
  printf("Return code from clEnqueueTask(26): ");
  printCLError(ret);

  // DT:  doesn't seem to help ... ;-(
  ret = clFinish(command_queue);

  /* Copy result for the 13 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out13, CL_TRUE, 0, point_num * sizeof(float), result13, 1, &event13, NULL);

  /* Copy result for the 26 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out26, CL_TRUE, 0, point_num * sizeof(float), result26, 1, &event26, NULL);


  /* OpenCL Object Finalization */
  ret = clReleaseKernel(kernel13);
  ret = clReleaseKernel(kernel26);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobj_in);
  ret = clReleaseMemObject(memobj_out13);
  ret = clReleaseMemObject(memobj_out26);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);


  /* Display results */

  /* DT: also added printout to see if actual numeric values are passing through */

  for (i=window_num_26-1; i < data_num; i++) {
    printf("result[%d]:", i );
    for (j=0; j < name_num; j++ ) {
      /* Display whether the 13 week average is greater */
      printf( "[%d] (%f,%f)", (result13[i*NAME_NUM+j] > result26[i*NAME_NUM+j]),result13[i*NAME_NUM+j],result26[i*NAME_NUM+j] );
    }
    printf("\n");
  }

  /* Deallocate memory on the host */
  free(result13);
  free(result26);
  free(kernel_src_str);

  return 0;
}

----(OPENCL 内核代码)moving_average_vec4.cl ----

__kernel void moving_average_vec4(__global int4 *values,
                  __global float4 *average,
                  int length,
                  int width)
{
  int i;
  int4 add_value; /* A vector to hold 4 components */

  /* Compute sum for the first "width" elements for 4 stocks */
  add_value = (int4)0;
  for (i=0; i < width; i++) {
    add_value += values[i];
  }
  average[width-1] = convert_float4(add_value);

  /* Compute sum for the (width)th ~ (length-1)th elements for 4 stocks */
  for (i=width; i < length; i++) {
    add_value = add_value - values[i-width] + values[i];
    average[i] = convert_float4(add_value);
  }

  /* Insert zeros to 0th ~ (width-2)th element for 4 stocks*/
  for (i=0; i < width-1; i++) {
    average[i] = (float4)(1.1f);
  }

  /* Compute average of (width-1) ~ (length-1) elements for 4 stocks  */
  for (i=width-1; i < length; i++) {
    average[i] /= (float4)width;
  }
}
4

2 回答 2

3

很少有 OpenCL 实现支持乱序命令队列。对于重叠的工作,改为使用多个命令队列和(如乱序队列)事件来同步。

于 2013-06-02T14:13:46.160 回答
0

只需使用 ATI HD6950 设置 Ubuntu 12.04 PC。上面的代码在那里按预期工作。所以我假设这是一个 MacOS 的东西。

通过 ssh 移动到 Aquamacs。

于 2013-05-30T21:48:07.413 回答