我正在尝试将两个矩阵相加,每个矩阵包含 100 个元素。我需要用任务并行(而不是数据并行)的方式来完成。我有下面这段代码,它在同一个矩阵上分别执行加法、乘法、减法和除法,但运行时它只输出 0,有时则输出 2、-0、-2 等值……
我需要在 Mac 上使用 OpenCL 来实现这一点,有什么想法吗?
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define MAX_SOURCE_SIZE (0x100000)
/*
 * OpenCL C source containing four kernels: taskParallelAdd, taskParallelSub,
 * taskParallelMul and taskParallelDiv.
 *
 * Each kernel takes three __global float buffers (A, B, C) and writes
 * C[k] = A[k] <op> B[k] for k = base, base+4, base+8, base+12, where base is
 * fixed per kernel (Add: 0, Sub: 1, Mul: 2, Div: 3).  No kernel queries a
 * work-item id, so the index sets written by the four kernels are disjoint
 * and independent of how the kernels are launched.
 *
 * Adjacent string literals are concatenated by the compiler, so the pieces
 * below form a single NUL-terminated source string.
 */
const char *_kernel =
    "\n"
    "__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 0; \n"
    " \n"
    " C[base+0] = A[base+0] + B[base+0]; \n"
    " C[base+4] = A[base+4] + B[base+4]; \n"
    " C[base+8] = A[base+8] + B[base+8]; \n"
    " C[base+12] = A[base+12] + B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 1; \n"
    " \n"
    " C[base+0] = A[base+0] - B[base+0]; \n"
    " C[base+4] = A[base+4] - B[base+4]; \n"
    " C[base+8] = A[base+8] - B[base+8]; \n"
    " C[base+12] = A[base+12] - B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 2; \n"
    " \n"
    " C[base+0] = A[base+0] * B[base+0]; \n"
    " C[base+4] = A[base+4] * B[base+4]; \n"
    " C[base+8] = A[base+8] * B[base+8]; \n"
    " C[base+12] = A[base+12] * B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    " int base = 3; \n"
    " \n"
    " C[base+0] = A[base+0] / B[base+0]; \n"
    " C[base+4] = A[base+4] / B[base+4]; \n"
    " C[base+8] = A[base+8] / B[base+8]; \n"
    " C[base+12] = A[base+12] / B[base+12]; \n"
    "} \n"
    " \n";
int main()
{
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
cl_program program = NULL;
cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
int i, j;
float* A;
float* B;
float* C;
A = (float*)malloc(4*4*sizeof(float));
B = (float*)malloc(4*4*sizeof(float));
C = (float*)malloc(4*4*sizeof(float));
/* Initialize input data */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
A[i*4+j] = i*4+j+1;
B[i*4+j] = j*4+i+1;
}
}
/* Get platform/device information */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue */
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
/* Create buffer object */
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
/* Copy input data to memory buffer */
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);
/* Create kernel from source */
program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create task parallel OpenCL kernel */
kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret);
kernel[1] = clCreateKernel(program, "taskParallelSub", &ret);
kernel[2] = clCreateKernel(program, "taskParallelMul", &ret);
kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret);
/* Set OpenCL kernel arguments */
for (i=0; i<4; i++) {
ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
}
/* Execute OpenCL kernel as task parallel */
for (i=0; i<4; i++) {
ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
}
/* Copy result to host */
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
/* Display result */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
printf("%7.2f ", C[i*4+j]);
}
printf("\n");
}
/* Finalization */
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel[0]);
ret = clReleaseKernel(kernel[1]);
ret = clReleaseKernel(kernel[2]);
ret = clReleaseKernel(kernel[3]);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}