c++ - 在 OpenCL 上读取错误数据

Question

我遇到了内核写入数据错误的地方或主机有时读取数据不正确的问题。我将相同的数据（写入数据的索引）写入两个具有不同类型的全局数组。为了确保索引正确，使用了通过 atom_inc 递增的全局计数器。从主机上的第二个阵列读取数据时会出现此问题。例如：

.....
output array index: 442: (output1 value:442.0000     output2 value:442) 
output array index: 443: (output1 value:443.0000     output2 value:443) 
output array index: 444: (output1 value:444.0000     output2 value:444) 
output array index: 445: (output1 value:445.0000     output2 value:445) 
output array index: 446: (output1 value:446.0000     output2 value:1152892928) 
output array index: 447: (output1 value:447.0000     output2 value:447) 
output array index: 448: (output1 value:448.0000     output2 value:1152909312) 
output array index: 449: (output1 value:449.0000     output2 value:1152917504) 
output array index: 450: (output1 value:450.0000     output2 value:1152925696)
......

正如您在指标 446、448、449 和 450+ 中看到的，输出 2 包含错误值。这可能是什么原因？

设备：ATI Radeon HD5750

代码示例：

#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>

// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \

"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \

"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";

void art_process_sinogram(const char* tiff_filename,
                          const float *angles2,
                          const unsigned int n_angles2,
                          const unsigned int n_ray2s,
                          const float distanc2e)
{
  /******************************
   * OPENCL ENVIRONMENT
   */
  cl_int status;
  cl_uint numPlatforms = 0;
  cl_platform_id *platforms = NULL;
  cl_device_id device_id;

  //discover platforms
  status = clGetPlatformIDs(0, NULL, &numPlatforms);
  platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
  status = clGetPlatformIDs(numPlatforms, platforms, NULL);

  //discover devices
  cl_uint numDevices = 0;
  cl_device_id *devices = NULL;

  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
  devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
  device_id = devices[1];
  //create context
  cl_context context = NULL;
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

  cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel  kernel_weights = clCreateKernel(program, "kernel1", &status);

  //create queue
  cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);

  /******************************
   * HARDWARE PARAMETERS
   */
  cl_uint wavefronts_per_SIMD = 7;
  size_t global_work_size;
  size_t local_work_size = 64;

  cl_uint max_compute_units;

  clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);

  size_t wg_count = max_compute_units * wavefronts_per_SIMD;
  global_work_size = wg_count * local_work_size;

  /**************************** DATA PART *************************************/

  size_t w_portion_size = 768 * sizeof(cl_float);
  size_t w_pos_portion_size = 768 * sizeof(cl_uint);

  size_t counters_data_size = 6 * sizeof(cl_uint);
  cl_uint counters_data[6];
  counters_data[0] = 1;
  counters_data[1] = 2; // max number of the cells intersected by the ray
  counters_data[2] = 3;
  counters_data[3] = 4;
  counters_data[4] = 5; // same to the number of rays
  counters_data[5] = 0; // counter inside kernel

  /*****************
   * Main buffers
   */
  cl_mem weights1_buffer = clCreateBuffer(context,
                                          CL_MEM_READ_WRITE,
                                          w_portion_size,
                                          NULL,
                                          NULL);

  cl_mem weights_pos1_buffer = clCreateBuffer(context,
                                              CL_MEM_READ_WRITE,
                                              w_pos_portion_size,
                                              NULL,
                                              NULL);
  /*****************
   * Supplement buffers (constant)
   */
  cl_mem counters_data_buffer = clCreateBuffer(context,
                                               CL_MEM_READ_ONLY,
                                               counters_data_size,
                                               NULL,
                                               &status);


  cl_event supplement_buffer_ready[1];

  status = clEnqueueWriteBuffer(command_queue1,
                                counters_data_buffer,
                                CL_FALSE,
                                0,
                                counters_data_size,
                                counters_data,
                                0,
                                NULL,
                                &supplement_buffer_ready[0]);

  status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
  status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
  status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);

  status = clEnqueueNDRangeKernel(command_queue1,
                                  kernel_weights,
                                  1, // work dimensional 1D, 2D, 3D
                                  NULL, // offset
                                  &global_work_size, // total number of WI
                                  &local_work_size, // nomber of WI in WG
                                  1, // num events in wait list
                                  supplement_buffer_ready,  // event wait list
                                  NULL); // event

  clFinish(command_queue1);
  cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
                                                      weights1_buffer,//*pmain_weights_buffer,
                                                      CL_TRUE,
                                                      CL_MAP_READ,
                                                      0,
                                                      w_portion_size,
                                                      0, NULL, NULL, NULL);
  cl_uint *output2 = malloc(w_portion_size);
  status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
                               CL_TRUE, 0, w_pos_portion_size, output2,
                               0, NULL, NULL);

  clFinish(command_queue1);
  for(int i = 0; i < 790; ++i) {
    printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
  }
}

解决方案：

内核应该是这样的（需要检查索引）：

  __kernel void k_1(__global uint *counter, 
                    __global uint *weights, 
                    __global uint2 *weights_pos)
 {
    const uint global_size = get_global_size(0);
    const uint global_id = get_global_id(0);
    uint local_id = get_local_id(0);

    uint insert_index = atom_inc(&counter[5]);
    if(insert_index < 768) {
       weights[insert_index]= insert_index;
       weights_pos[insert_index].x = insert_index;
       weights_pos[insert_index].y = insert_index;
    }
}

score 2 · Accepted Answer

您正在搞乱缓冲区尺寸。

1）您的缓冲区每个包含 768 个元素（请参阅w_portion_size和的初始化w_pos_portion_size）

2) 我机器上的工作组大小是 896（见初始化wg_count）

3) 你打印出 790 个值。

除此之外，这里还有一个概念性错误：

if(global_id == 0) {
     counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]

您不能假设第一个虚拟处理器将在其他虚拟处理器之前执行此行。您应该完全删除此行，因为您counter[5]在主机端进行了初始化。（我相信这是您的问题的原因，但我无法重现）。

解决这些问题后，您的代码似乎运行良好（英特尔实现）。

score 0 · Accepted Answer

内核应该是这样的（需要检查索引）：

__kernel void k_1(__global uint *counter, 
                __global uint *weights, 
                __global uint2 *weights_pos)
{
   const uint global_size = get_global_size(0);
   const uint global_id = get_global_id(0);
   uint local_id = get_local_id(0);

   uint insert_index = atom_inc(&counter[5]);

   if(insert_index < 768) {
     weights[insert_index]= insert_index;
     weights_pos[insert_index].x = insert_index;
    weights_pos[insert_index].y = insert_index;
   }
}

c++ - 在 OpenCL 上读取错误数据

2 回答 2

Related

Reference