opencl - OpenCL kernel works with certain data types but not others

Question

I'm trying to learn OpenCL, and currently I'm practising making different kernels. In my attempt to make a dot product using reduction methods, I ran into an issue I don't understand. When I run my code with int inputs and output, it works fine. When I change all the int types to float types (for the inputs and outputs) it gives me a result that is close but is slightly off. Can anyone figure out why this is or what is causing it?

Here's the host code

#include <stdio.h>
#include <CL/cl.h>
#include <string.h>
#include <stdlib.h>

/*
*   IF ERROR, PRINT OUT OPENCL ERROR CODE
*/
void errorCheck(cl_int error){
    if(error < CL_SUCCESS){
        printf("Failed - OpenCL error #: %d\n", error);
        //exit(1);
    }
}

int main(){

cl_int err;
cl_uint numPlatforms = 0;
cl_uint numDevices = 0;
cl_platform_id * platformIDs = NULL;
cl_device_id * deviceIDs = NULL;
cl_context context = NULL;
cl_command_queue cmdQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;

FILE *program_handle;
char *program_buffer;
size_t program_size;

cl_mem a_buff = NULL;
cl_mem b_buff = NULL;
cl_mem c_buff = NULL;
int * a = NULL;
int * b = NULL;
int * c = NULL;
int correct_dot = 0;
int ELEMENTS = 1024;

const int datasize = sizeof(int)*ELEMENTS;

a = (int *)malloc(datasize);
b = (int *)malloc(datasize);
c = (int *)malloc(datasize);

int i;
for(i = 0; i<ELEMENTS; i++){
    a[i] = i;
    b[i] = i;
    correct_dot += (int)i*i;
}

/*
**RETRIEVE NUMBER OF PLATFORMS
*/
err = clGetPlatformIDs(0, NULL, &numPlatforms);
errorCheck(err);

/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF PLATFORMS
*/
platformIDs = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
errorCheck(err);

/*
**RETRIEVE NUMBER OF DEVICES
*/
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
errorCheck(err);

/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF DEVICES
*/
deviceIDs = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, deviceIDs, NULL);
errorCheck(err);

/*
**GET INFO ON DEVICE
*/
cl_uint * info;
size_t info_size;

err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, 0, NULL, &info_size);
info = (cl_uint *) malloc(info_size);
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, info_size, info, NULL);
errorCheck(err);


/*
**CREATE CONTEXT
*/
context = clCreateContext(NULL, numDevices, deviceIDs, NULL, NULL, &err);
errorCheck(err);

/*
**CREATE COMMAND QUEUE
*/
cmdQueue = clCreateCommandQueue(context, deviceIDs[0], 0, &err);
errorCheck(err);

/*
**CREATE BUFFERS
*/
a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, a, &err);
errorCheck(err);
b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, b, &err);
errorCheck(err);
c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, datasize, c, &err);
errorCheck(err);

err = clEnqueueWriteBuffer(cmdQueue, a_buff, CL_TRUE, 0, datasize, a, 0, NULL, NULL);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, b_buff, CL_TRUE, 0, datasize, b, 0, NULL, NULL);
errorCheck(err);

/*
**READ PROGRAM FROM FILE
*/
program_handle = fopen("kernels/mvdot.cl", "r");
if(program_handle == NULL) {
    perror("Couldn't find the program file");
    exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);

/*
**CREATE PROGRAM
*/
program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err);
errorCheck(err);

/*
**BUILD PROGRAM
*/
err = clBuildProgram(program, 1, deviceIDs, NULL, NULL, NULL);
errorCheck(err);
if (err == CL_BUILD_PROGRAM_FAILURE) {
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    // Allocate memory for the log
    char *log = (char *) malloc(log_size);
    // Get the log
    clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    // Print the log
    printf("%s\n", log);
}

/*
**CREATE KERNEL
*/
kernel = clCreateKernel(program, "mvdot", &err);
errorCheck(err);

/*
**SET UP KERNEL ARGS
*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a_buff);  //input A
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b_buff); //input B
err |= clSetKernelArg(kernel, 2, datasize, NULL);           //local scratch 
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &c_buff); //output C
err |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &ELEMENTS);   //size
errorCheck(err);

/*
**ENQUEUE KERNEL FOR EXECUTION
*/
const size_t global_work_size = ELEMENTS;
const size_t local_work_size = info[0];
err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
errorCheck(err);


/*
**READ OUTPUT
*/
err = clEnqueueReadBuffer(cmdQueue, c_buff, CL_TRUE, 0, datasize, c, 0, NULL, NULL);
errorCheck(err);

/*
**CHECK OUTPUT
*/
int our_dot = 0;
printf("C = [");
//elements/info[0] gives # of work groups
for(i = 0; i<ELEMENTS; i++){
    printf(" %d ", c[i]);
    our_dot += c[i];
}
printf("]\n");

printf("Our dot = %d\n", our_dot);
printf("Correct dot = %d\n", correct_dot);

if(our_dot == correct_dot){
    printf("PASSED\n");
} else {
    printf("FAILED\n");
}

/*
**FREE RESOURCES
*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);

clReleaseMemObject(a_buff);
clReleaseMemObject(b_buff);
clReleaseMemObject(c_buff);

free(platformIDs);
free(deviceIDs);
free(program_buffer);

free(a);
free(b);
free(c);

}

And here's the kernel code.

__kernel void mvdot(
            __global int* a,
        __global int* b,
            __local int* scratch,
            __global int* c,
        __const int size)
{

    int g_id = get_global_id(0);
    int l_id = get_local_id(0);
    int local_size = get_local_size(0);
    int b_id = get_group_id(0);

    //move the product of the two inputs to scratch
    //if our thread is outside of the range of the inputs
    //pad the scratch with 0
    scratch[l_id] = (g_id < size) ? (a[g_id] * b[g_id]) : 0;

    //Use local memory to compute the sum of the products
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = 2; i<=local_size; i*=2) {
        int offset = local_size/i;
        scratch[l_id] = (l_id < offset) ? (scratch[l_id] + scratch[l_id + offset]) : scratch[l_id];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //Move the final sum for each group to the output
    if(l_id == 0){
        c[b_id] = scratch[0];
    }

}

I'm sure it's something very silly, but I'm just not understanding it.

score 1 · Accepted Answer

在您的本地内存减少中存在竞争条件，因为不同的工作项在连续的循环迭代中读取和写入相同的位置。每次迭代都需要一个屏障，并且还应该避免在不执行任何工作时重写值（因为这会与尝试读取位置的其他工作项冲突）。

编写这种工作组缩减的典型方法如下所示：

//Use local memory to compute the sum of the products
for (int offset = local_size/2; offset > 0; offset/=2) {
    barrier(CLK_LOCAL_MEM_FENCE);
    if (l_id < offset)
      scratch[l_id] = scratch[l_id] + scratch[l_id + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);

如果您知道正在执行的硬件的 SIMD 宽度，则可以展开循环的最后几次迭代并在没有障碍的情况下执行它们（如果工作项以锁步执行，则不再需要同步）。

正如@DarkZeros 在评论中指出的那样，这里的罪魁祸首是浮点的精度（尽管仍然应该处理上面的竞争条件）。您可以通过将数据类型更改为来验证这一点double- 这样做会导致您的代码通过我的 GPU 上的测试。

opencl - OpenCL kernel works with certain data types but not others

1 回答 1

Related

Reference