2

我为一个 OpenCL 程序编写了下面这个 Mandelbrot 内核。为了测试,我把整个复平面的计算结果都存放在一个一维数组中。我的问题是:打印输出时,得到的全是 1(也就是 results 数组初始化时的值),而不是内核计算出的结果。

我哪里有问题?

    #include <iostream>

    #ifdef __APPLE__
    #include <OpenCL/opencl.h>
    #else
    #include <CL/cl.h>
    #endif

    int main(){
    using namespace std;
    int xPixel=100;
    int yPixel=100;
    float ics[xPixel];
    for(int i=0;i<xPixel;++i)
    ics[i]=-2+i*((float)4/xPixel);
  float ypsilon[yPixel];
  for(int i=0;i<yPixel;++i)
    ypsilon[i]=-2+i*((float)4/yPixel);
  int results[xPixel*yPixel];
  for(int i=0;i<xPixel*yPixel;++i)
    results[i]=1;

  cl_context context;
  cl_context_properties properties[3];
  cl_kernel kernel;
  cl_command_queue command_queue;
  cl_program program;
  cl_int err;
  cl_uint num_of_platforms=0;
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_uint num_of_devices=0;
  cl_mem memX, memY, memOutput;
  size_t global;

const char *KernelSource =
"__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output){\n"\
"size_t id=get_global_id(0);\n"\
"int yPixel=100;\n"\
"for(int i=0;i<yPixel;i++){\n"\
"float x=0;\n"\
"float y=0;\n"\
"int counter=0;\n"\
"while(counter<1000){\n"\
"if(x*x+y*y>2*2){\n"\
"output[(id*yPixel)+i]=counter;\n"\
"break;\n"\
"}\n"\
"float xTemp=x*x-y*y+ics[id];\n"\
"y=2*x*y+ypsilon[i];\n"\
"x=xTemp;\n"\
"counter++;\n"\
"}\n"\
"}\n"\
"}\n";

  // retreives a list of platforms available
  if (clGetPlatformIDs(1, &platform_id, &num_of_platforms)!= CL_SUCCESS){
    cout<<"Unable to get platform_id\n"<<endl;;
    return 1;
  }

  // try to get a supported GPU device
  if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices) != CL_SUCCESS){
    cout<<"Unable to get device_id\n"<<endl;
    return 1;
  }

  //context properties list - nust be terminated with 0
  properties[0]=CL_CONTEXT_PLATFORM;
  properties[1]=(cl_context_properties)platform_id;
  properties[2]=0;

  //create a context with the GPU device
  context=clCreateContext(properties,1,&device_id,NULL,NULL,&err);

  //create a command queue using the context and device
  command_queue=clCreateCommandQueue(context,device_id,0,&err);

  //create a program from the kernel source code
  program=clCreateProgramWithSource(context,1,(const char**)&KernelSource,NULL,&err);

  //compile the program
  if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
    cout<<"Error building program"<<endl;
    return 1;
  }

  //specify which kernel from the program to execute
  kernel=clCreateKernel(program,"mandelbrot",&err);

  //create buffers for input and output
  memX=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*xPixel,NULL,NULL);
  memY=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*yPixel,NULL,NULL);
  memOutput=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(int)*(xPixel*yPixel),NULL,NULL);

  //load data into the input buffer
  clEnqueueWriteBuffer(command_queue,memX,CL_TRUE,0,sizeof(float)*xPixel,ics,0,NULL,NULL);
  clEnqueueWriteBuffer(command_queue,memY,CL_TRUE,0,sizeof(float)*yPixel,ypsilon,0,NULL,NULL);

  //set the argument list for the kernel command
  clSetKernelArg(kernel,0,sizeof(cl_mem),&memX);
  clSetKernelArg(kernel,1,sizeof(cl_mem),&memY);
  clSetKernelArg(kernel,2,sizeof(cl_mem),&memOutput);
  global=xPixel*yPixel;

  //enqueue the kernel command for execution
  clEnqueueNDRangeKernel(command_queue,kernel,1,NULL,&global,NULL,0,NULL,NULL);
  clFinish(command_queue);

  //copy the results from out of the output buffer
  clEnqueueReadBuffer(command_queue,memOutput,CL_TRUE,0,sizeof(int)*(xPixel*yPixel),results,0,NULL,NULL);

  //print output
   for(int i=0;i<xPixel;++i){
     for(int j=0;j<yPixel;++j){
       cout<<results[(i*yPixel)+j]<<" ";
     }
     cout<<endl;
   }

  //cleanup - release OpenCL resources
  clReleaseMemObject(memX);
  clReleaseMemObject(memY);
  clReleaseMemObject(memOutput);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(command_queue);
  clReleaseContext(context);
}
4

1 回答 1

0

我没有看出确切的原因,但我确实有一个疑问:既然你已经为每个元素启动了一个工作项,那么内核里让 "i" 在 "yPixel" 上循环的目的是什么?看起来你做的是 X*Y*Y 次计算,而不是 X*Y 次(你的全局大小是 X*Y,而内核内部又在 Y 上循环了一遍)。

如果您在“i”循环之前添加“output[(id*yPixel)+i]=42” ,那么您的输出缓冲区会保存什么?这将告诉您问题出在您的内核还是主机代码中。

为了帮助其他人看到这个,我重新格式化了内核代码:

/*
 * Computes Mandelbrot iteration counts for one x sample against 100 y samples.
 *
 * ics     - x coordinates; this work-item reads ics[get_global_id(0)]
 * ypsilon - y coordinates; elements 0..99 are read
 * output  - iteration counts; this work-item writes output[id*100 + i] for i in 0..99
 *
 * Because the kernel indexes ics[] by the global id and loops over every y
 * sample itself, the global work size must equal the number of elements in
 * ics; a larger launch reads and writes out of bounds.
 */
__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output)
{
  size_t id=get_global_id(0);
  int yPixel=100;
  for(int i=0;i<yPixel;i++)
  {
    float x=0;
    float y=0;
    int counter=0;
    /* iterate z = z*z + c until |z| exceeds 2 or the 1000-iteration cap is hit */
    while(counter<1000 && x*x+y*y<=2*2)
    {
      float xTemp=x*x-y*y+ics[id];
      y=2*x*y+ypsilon[i];
      x=xTemp;
      counter++;
    }
    /* Always store the count: points that never escape get 1000 instead of
       leaving their output element unwritten. */
    output[(id*yPixel)+i]=counter;
  }
}
于 2013-12-08T02:37:19.407 回答