I'm trying to learn OpenCL, and currently I'm practising making different kernels. In my attempt to make a dot product using reduction methods, I ran into an issue I don't understand. When I run my code with int inputs and output, it works fine. When I change all the int types to float types (for the inputs and outputs) it gives me a result that is close but is slightly off. Can anyone figure out why this is or what is causing it?
Here's the host code
#include <stdio.h>
#include <CL/cl.h>
#include <string.h>
#include <stdlib.h>
/*
* IF ERROR, PRINT OUT OPENCL ERROR CODE
*/
void errorCheck(cl_int error){
if(error < CL_SUCCESS){
printf("Failed - OpenCL error #: %d\n", error);
//exit(1);
}
}
int main(){
cl_int err;
cl_uint numPlatforms = 0;
cl_uint numDevices = 0;
cl_platform_id * platformIDs = NULL;
cl_device_id * deviceIDs = NULL;
cl_context context = NULL;
cl_command_queue cmdQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
FILE *program_handle;
char *program_buffer;
size_t program_size;
cl_mem a_buff = NULL;
cl_mem b_buff = NULL;
cl_mem c_buff = NULL;
int * a = NULL;
int * b = NULL;
int * c = NULL;
int correct_dot = 0;
int ELEMENTS = 1024;
const int datasize = sizeof(int)*ELEMENTS;
a = (int *)malloc(datasize);
b = (int *)malloc(datasize);
c = (int *)malloc(datasize);
int i;
for(i = 0; i<ELEMENTS; i++){
a[i] = i;
b[i] = i;
correct_dot += (int)i*i;
}
/*
**RETRIEVE NUMBER OF PLATFORMS
*/
err = clGetPlatformIDs(0, NULL, &numPlatforms);
errorCheck(err);
/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF PLATFORMS
*/
platformIDs = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
errorCheck(err);
/*
**RETRIEVE NUMBER OF DEVICES
*/
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
errorCheck(err);
/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF DEVICES
*/
deviceIDs = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, deviceIDs, NULL);
errorCheck(err);
/*
**GET INFO ON DEVICE
*/
cl_uint * info;
size_t info_size;
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, 0, NULL, &info_size);
info = (cl_uint *) malloc(info_size);
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, info_size, info, NULL);
errorCheck(err);
/*
**CREATE CONTEXT
*/
context = clCreateContext(NULL, numDevices, deviceIDs, NULL, NULL, &err);
errorCheck(err);
/*
**CREATE COMMAND QUEUE
*/
cmdQueue = clCreateCommandQueue(context, deviceIDs[0], 0, &err);
errorCheck(err);
/*
**CREATE BUFFERS
*/
a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, a, &err);
errorCheck(err);
b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, b, &err);
errorCheck(err);
c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, datasize, c, &err);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, a_buff, CL_TRUE, 0, datasize, a, 0, NULL, NULL);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, b_buff, CL_TRUE, 0, datasize, b, 0, NULL, NULL);
errorCheck(err);
/*
**READ PROGRAM FROM FILE
*/
program_handle = fopen("kernels/mvdot.cl", "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/*
**CREATE PROGRAM
*/
program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err);
errorCheck(err);
/*
**BUILD PROGRAM
*/
err = clBuildProgram(program, 1, deviceIDs, NULL, NULL, NULL);
errorCheck(err);
if (err == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
}
/*
**CREATE KERNEL
*/
kernel = clCreateKernel(program, "mvdot", &err);
errorCheck(err);
/*
**SET UP KERNEL ARGS
*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a_buff); //input A
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b_buff); //input B
err |= clSetKernelArg(kernel, 2, datasize, NULL); //local scratch
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &c_buff); //output C
err |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &ELEMENTS); //size
errorCheck(err);
/*
**ENQUEUE KERNEL FOR EXECUTION
*/
const size_t global_work_size = ELEMENTS;
const size_t local_work_size = info[0];
err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
errorCheck(err);
/*
**READ OUTPUT
*/
err = clEnqueueReadBuffer(cmdQueue, c_buff, CL_TRUE, 0, datasize, c, 0, NULL, NULL);
errorCheck(err);
/*
**CHECK OUTPUT
*/
int our_dot = 0;
printf("C = [");
//elements/info[0] gives # of work groups
for(i = 0; i<ELEMENTS; i++){
printf(" %d ", c[i]);
our_dot += c[i];
}
printf("]\n");
printf("Our dot = %d\n", our_dot);
printf("Correct dot = %d\n", correct_dot);
if(our_dot == correct_dot){
printf("PASSED\n");
} else {
printf("FAILED\n");
}
/*
**FREE RESOURCES
*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);
clReleaseMemObject(a_buff);
clReleaseMemObject(b_buff);
clReleaseMemObject(c_buff);
free(platformIDs);
free(deviceIDs);
free(program_buffer);
free(a);
free(b);
free(c);
}
And here's the kernel code.
__kernel void mvdot(
__global int* a,
__global int* b,
__local int* scratch,
__global int* c,
__const int size)
{
int g_id = get_global_id(0);
int l_id = get_local_id(0);
int local_size = get_local_size(0);
int b_id = get_group_id(0);
//move the product of the two inputs to scratch
//if our thread is outside of the range of the inputs
//pad the scratch with 0
scratch[l_id] = (g_id < size) ? (a[g_id] * b[g_id]) : 0;
//Use local memory to compute the sum of the products
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = 2; i<=local_size; i*=2) {
int offset = local_size/i;
scratch[l_id] = (l_id < offset) ? (scratch[l_id] + scratch[l_id + offset]) : scratch[l_id];
}
barrier(CLK_LOCAL_MEM_FENCE);
//Move the final sum for each group to the output
if(l_id == 0){
c[b_id] = scratch[0];
}
}
I'm sure it's something very silly, but I'm just not understanding it.