我正在尝试使用 CodeXL(更确切地说是 sprofile)来分析一些 OpenCL 代码。在性能计数器模式下进行分析时(但使用跟踪选项 -t 时不会),程序总是给出错误的输出,所以我试图找出原因。
经过一些实验后,我得出结论:每个内核都被执行了 3 次,这会导致那些修改现有数据(而不是覆盖数据)的内核产生错误的结果。以下玩具程序展示了这种行为。
我的问题是:是否有人知道它为什么会这样以及如何阻止它这样做?
我的环境如下。操作系统:Fedora Linux 18;CodeXL 版本:CodeXL-Linux-1.1.1537.0;显卡:ATI Technologies Inc Device 6798。
这是执行命令:
/opt/CodeXL-Linux-1.1.1537.0-x86_64-release/Output_x86_64/release/bin/x86_64/sprofile -o example.csv -w . OpenCLExample
我的代码:
/// Creates an OpenCL context for the GPU device(s) of the first platform
/// reported by clGetPlatformIDs.
///
/// @return The new context, or NULL when no platform could be obtained or
///         the context could not be created. The caller owns the context and
///         is responsible for releasing it with clReleaseContext.
cl_context CreateContext()
{
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId = NULL;

    // Only the first platform is requested. Stop here on failure so that an
    // uninitialized platform id is never handed to context creation.
    cl_int errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        return NULL;
    }

    // Zero-terminated property list selecting the platform found above.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        reinterpret_cast<cl_context_properties>(firstPlatformId),
        0
    };

    cl_context context = clCreateContextFromType(contextProperties,
                                                 CL_DEVICE_TYPE_GPU,
                                                 NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        return NULL;
    }
    return context;
}
/// Creates a command queue on the first device attached to @p context.
///
/// @param context Context whose device list is queried.
/// @param device  Out-parameter that receives the first device of the
///                context once the device list has been read; may be NULL
///                if the caller does not need the device.
/// @return The new command queue, or NULL if the device list could not be
///         queried, the context reports no devices, or queue creation failed.
///         The caller releases the queue with clReleaseCommandQueue.
cl_command_queue CreateCommandQueue(cl_context context,cl_device_id *device)
{
    // First call only asks how many bytes the device list occupies.
    size_t deviceBufferSize = 0;
    cl_int errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    const size_t deviceCount = deviceBufferSize / sizeof(cl_device_id);
    if (errNum != CL_SUCCESS || deviceCount == 0)
    {
        // Without this guard devices[0] below would be read out of bounds.
        return NULL;
    }

    cl_command_queue commandQueue = NULL;
    cl_device_id *devices = new cl_device_id[deviceCount];

    // Second call fetches the actual device ids.
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
    if (errNum == CL_SUCCESS)
    {
        commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
        if (device != NULL)
        {
            *device = devices[0];
        }
    }

    delete[] devices;
    return commandQueue;
}
/// Loads OpenCL source text from @p filename and builds it into a program.
///
/// @param context  Context the program is created in.
/// @param device   Not used by this function; the program is built with a
///                 NULL device list.
/// @param filename Path of the kernel source file to read.
/// @return The built program, or NULL if the file could not be opened, the
///         program object could not be created, or the build failed. On a
///         build failure the program object is released before returning.
cl_program CreateProgram(cl_context context,cl_device_id device,const char* filename)
{
    (void)device;

    if (filename == NULL)
    {
        return NULL;
    }

    std::ifstream kernelFile(filename, std::ios::in);
    if (!kernelFile.is_open())
    {
        // A missing source file would otherwise produce an empty program.
        return NULL;
    }

    // Slurp the entire file into one string.
    std::ostringstream oss;
    oss << kernelFile.rdbuf();
    const std::string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();

    cl_int errNum = CL_SUCCESS;
    cl_program program = clCreateProgramWithSource(context, 1, &srcStr, NULL, &errNum);
    if (program == NULL || errNum != CL_SUCCESS)
    {
        return NULL;
    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // Do not hand back a program that cannot yield kernels.
        clReleaseProgram(program);
        return NULL;
    }
    return program;
}
/// Creates the three device buffers used by the example kernel.
///
/// memObjects[0] and memObjects[1] are read-only buffers initialized by
/// copying ARRAY_SIZE floats from @p a and @p b; memObjects[2] is an
/// uninitialized read-write buffer of the same size.
///
/// @param context    Context the buffers are created in.
/// @param memObjects Receives the three buffer handles.
/// @param a          Host data copied into memObjects[0].
/// @param b          Host data copied into memObjects[1].
/// @return true if all three buffers were created, false otherwise. On
///         failure any buffers that were created remain in @p memObjects
///         (failed slots are NULL) and are left for the caller to release.
bool CreateMemObjects(cl_context context,cl_mem memObjects[3],float *a,float *b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

    memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, b, NULL);
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bufferBytes, NULL, NULL);

    // clCreateBuffer yields NULL on failure; report that to the caller
    // instead of unconditionally claiming success.
    return memObjects[0] != NULL && memObjects[1] != NULL && memObjects[2] != NULL;
}
int main(int arg,char** argv)
{
cl_context context=0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = {0,0,0};
cl_int errNum;
context = CreateContext();
commandQueue = CreateCommandQueue(context,&device);
program = CreateProgram(context,device,"Example.cl");
kernel = clCreateKernel(program,"example_kernel",NULL);
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for(int i=0;i<ARRAY_SIZE;i++)
{
a[i] = i;
b[i] = i*2;
}
if(!CreateMemObjects(context,memObjects,a,b))
return 1;
errNum = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObjects[0]);
errNum |= clSetKernelArg(kernel,1,sizeof(cl_mem),&memObjects[1]);
errNum |= clSetKernelArg(kernel,2,sizeof(cl_mem),&memObjects[2]);
size_t globalWorkSize[1] = {ARRAY_SIZE};
size_t localWorkSize[1] = { 1 };
errNum = clEnqueueNDRangeKernel(commandQueue,kernel,1,NULL,globalWorkSize,localWorkSize,0,
NULL,NULL);
errNum = clEnqueueReadBuffer(commandQueue,memObjects[2], CL_TRUE,
0,ARRAY_SIZE*sizeof(float),result,
0,NULL,NULL);
return 0;
}
内核(Example.cl):
// Turn on the cl_amd_printf extension so the kernel can call printf.
#pragma OPENCL EXTENSION cl_amd_printf : enable

// Writes the element-wise product a[i] * b[i] into result[i] for this
// work-item's global id, then prints the id and the stored value.
__kernel void example_kernel(__global const float *a,
                             __global const float *b,
                             __global float *result)
{
    const int idx = get_global_id(0);
    const float product = a[idx] * b[idx];

    result[idx] = product;
    printf((__constant char *)"DEBUG: example_kernel id: %d result: %g\n", idx, product);
}
这是我得到的结果:
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18