1

我正在尝试将 2 个矩阵相加,每个矩阵包含 100 个元素。我需要用任务并行的方式来完成,而不是数据并行。我拿到了下面这段代码,它在同一个矩阵上分别做加法、减法、乘法和除法,但是运行时它只返回 0,有时则返回 2、-0、-2 等……

我需要在 Mac 上使用 OpenCL 来实现这件事,有什么想法吗?

#include <stdio.h>
#include <stdlib.h>

#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE (0x100000)

const char *_kernel = "\n" \
"__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 0; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  + B[base+0]; \n" \
"    C[base+4]  = A[base+4]  + B[base+4]; \n" \
"    C[base+8]  = A[base+8]  + B[base+8]; \n" \
"    C[base+12] = A[base+12] + B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 1; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  - B[base+0]; \n" \
"    C[base+4]  = A[base+4]  - B[base+4]; \n" \
"    C[base+8]  = A[base+8]  - B[base+8]; \n" \
"    C[base+12] = A[base+12] - B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 2; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  * B[base+0]; \n" \
"    C[base+4]  = A[base+4]  * B[base+4]; \n" \
"    C[base+8]  = A[base+8]  * B[base+8]; \n" \
"    C[base+12] = A[base+12] * B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 3; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  / B[base+0]; \n" \
"    C[base+4]  = A[base+4]  / B[base+4]; \n" \
"    C[base+8]  = A[base+8]  / B[base+8]; \n" \
"    C[base+12] = A[base+12] / B[base+12]; \n" \
"} \n" \
" \n";

/*
 * Host program: fills two 4x4 float matrices, uploads them to OpenCL
 * buffers, builds the four kernels from `_kernel`, enqueues each one with
 * clEnqueueTask on a single command queue, reads buffer C back and prints
 * it as a 4x4 grid.
 *
 * Every OpenCL status code is stored in `ret`, but `ret` is overwritten by
 * the next call and never inspected, so any failure goes unnoticed.
 * Always returns 0.
 */
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    int i, j;
    float* A;
    float* B;
    float* C;

    /* Host copies of the matrices; C is left uninitialized until the read-back below. */
    A = (float*)malloc(4*4*sizeof(float));
    B = (float*)malloc(4*4*sizeof(float));
    C = (float*)malloc(4*4*sizeof(float));

    /* Initialize input data: A holds 1..16 in row-major order, B uses the transposed index pattern */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            A[i*4+j] = i*4+j+1;
            B[i*4+j] = j*4+i+1;
        }
    }

    /* Get platform/device information (first platform, its default device) */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* Create OpenCL Context */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create command queue */
    /* NOTE(review): the out-of-order property may not be supported by every
     * device; `ret` is not checked here, so a failed queue creation would go
     * undetected and every later call would operate on a NULL queue. */
    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

    /* Create buffer object */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);

    /* Copy input data to memory buffer (blocking writes) */
    ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);

    /* Create kernel from source */
    program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
    ret     = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* Create task parallel OpenCL kernel */
    kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret);
    kernel[1] = clCreateKernel(program, "taskParallelSub", &ret);
    kernel[2] = clCreateKernel(program, "taskParallelMul", &ret);
    kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret);

    /* Set OpenCL kernel arguments: all four kernels share the same A, B and C buffers */
    for (i=0; i<4; i++) {
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
        ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
        ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
    }

    /* Execute OpenCL kernel as task parallel */
    for (i=0; i<4; i++) {
        ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
    }

    /* Copy result to host (blocking read; no event wait list ties it to the kernels above) */
    ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);

    /* Display result as a 4x4 grid */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            printf("%7.2f ", C[i*4+j]);
        }
        printf("\n");
    }

    /* Finalization: drain the queue, then release every OpenCL object and the host arrays */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel[0]);
    ret = clReleaseKernel(kernel[1]);
    ret = clReleaseKernel(kernel[2]);
    ret = clReleaseKernel(kernel[3]);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(Amobj);
    ret = clReleaseMemObject(Bmobj);
    ret = clReleaseMemObject(Cmobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(A);
    free(B);
    free(C);

    return 0;
}
4

1 回答 1

1

command_queue 并没有被创建,因为 clCreateCommandQueue 返回了 -35(CL_INVALID_QUEUE_PROPERTIES),所以从这一步往后基本上什么都没有正常工作(连内核都没有运行)。您打印出来的只是 C 矩阵所在内存中的随机值(因为它未初始化)。您确实需要检查所有 API 调用的返回值是否有错误,这样就能立刻发现这个问题。

该错误与您使用 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE 标志有关。这个标志显然不受支持,而且无论如何它也做不到您真正想要的事情。该标志告诉 OpenCL 运行时:对于给定的队列,内核不必按照入队的顺序执行。但其本质仍然是带有数据并行性的顺序内核执行,这与同时运行多个内核不同,而后者才是您想要的任务并行执行。

您需要做的是创建四个命令队列,每个内核一个。然后,您可以通过事件等待所有队列执行完成。但是,如果这些内核共享同一个输出矩阵,则需要小心,确保不会意外引入竞争条件。

任务并行模型在 OpenCL 1.2 参考手册的第 3.4.2 节中进行了描述。当像这样运行多个队列时,您可能希望使用事件来跟踪每个队列的执行和完成状态。有关详细信息,请参阅参考资料的第 5.9 节。

下面是您的测试代码,已更新为使用多个队列来并行运行各个任务。我快速验证过,结果是正确的。

#include <stdio.h>
#include <stdlib.h>

#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE (0x100000)

const char *_kernel = "\n" \
"__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 0; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  + B[base+0]; \n" \
"    C[base+4]  = A[base+4]  + B[base+4]; \n" \
"    C[base+8]  = A[base+8]  + B[base+8]; \n" \
"    C[base+12] = A[base+12] + B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 1; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  - B[base+0]; \n" \
"    C[base+4]  = A[base+4]  - B[base+4]; \n" \
"    C[base+8]  = A[base+8]  - B[base+8]; \n" \
"    C[base+12] = A[base+12] - B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 2; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  * B[base+0]; \n" \
"    C[base+4]  = A[base+4]  * B[base+4]; \n" \
"    C[base+8]  = A[base+8]  * B[base+8]; \n" \
"    C[base+12] = A[base+12] * B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
"    int base = 3; \n" \
"     \n" \
"    C[base+0]  = A[base+0]  / B[base+0]; \n" \
"    C[base+4]  = A[base+4]  / B[base+4]; \n" \
"    C[base+8]  = A[base+8]  / B[base+8]; \n" \
"    C[base+12] = A[base+12] / B[base+12]; \n" \
"} \n" \
" \n";

/*
 * Reports a failed OpenCL call on stderr.
 * Returns nonzero when `err` is not CL_SUCCESS, zero otherwise.
 */
static int cl_failed(cl_int err, const char *what)
{
    if (err == CL_SUCCESS) {
        return 0;
    }
    fprintf(stderr, "%s failed with OpenCL error %d\n", what, (int)err);
    return 1;
}

/* Best effort: prints the compiler log of `program` for `device` on stderr. */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t log_size = 0;
    char *log;

    /* First query only the size of the log, then fetch it. */
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &log_size) != CL_SUCCESS || log_size == 0) {
        return;
    }
    log = malloc(log_size);
    if (log == NULL) {
        return;
    }
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              log_size, log, NULL) == CL_SUCCESS) {
        fprintf(stderr, "%.*s\n", (int)log_size, log);
    }
    free(log);
}

/*
 * Host program: runs the four kernels from `_kernel` as parallel tasks,
 * one in-order command queue per kernel, on 4x4 float matrices A and B,
 * then prints the resulting matrix C as a 4x4 grid.
 *
 * Every allocation and OpenCL call is checked; on the first failure the
 * error is reported on stderr, all resources acquired so far are released
 * and EXIT_FAILURE is returned. Returns EXIT_SUCCESS otherwise.
 */
int main(void)
{
    enum { DIM = 4, NUM_ELEMS = DIM * DIM, NUM_TASKS = 4 };
    static const char *const kernel_names[NUM_TASKS] = {
        "taskParallelAdd", "taskParallelSub", "taskParallelMul", "taskParallelDiv"
    };
    const size_t bytes = NUM_ELEMS * sizeof(float);

    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_uint num_platforms = 0;
    cl_int ret;
    int status = EXIT_FAILURE;
    int i, j;

    float *A = malloc(bytes);
    float *B = malloc(bytes);
    /* Zero-initialized so C never holds indeterminate values. */
    float *C = calloc(NUM_ELEMS, sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    /* Initialize input data: A is 1..16 row-major, B uses the transposed index pattern */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            A[i*DIM+j] = (float)(i*DIM+j+1);
            printf("A[%d] = %d\n", i*DIM+j, i*DIM+j+1);
            B[i*DIM+j] = (float)(j*DIM+i+1);
            printf("B[%d] = %d\n", i*DIM+j, j*DIM+i+1);
        }
    }

    /* Get platform/device information */
    ret = clGetPlatformIDs(1, &platform_id, &num_platforms);
    if (cl_failed(ret, "clGetPlatformIDs")) {
        goto cleanup;
    }
    if (num_platforms == 0) {
        fprintf(stderr, "no OpenCL platform available\n");
        goto cleanup;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
    if (cl_failed(ret, "clGetDeviceIDs")) {
        goto cleanup;
    }

    /* Create OpenCL Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (cl_failed(ret, "clCreateContext")) {
        goto cleanup;
    }

    /* Create buffer objects */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(A)")) {
        goto cleanup;
    }
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(B)")) {
        goto cleanup;
    }
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(C)")) {
        goto cleanup;
    }

    /* One in-order queue per task so the kernels may run concurrently */
    for (i = 0; i < NUM_TASKS; i++) {
        command_queue[i] = clCreateCommandQueue(context, device_id, 0, &ret);
        if (cl_failed(ret, "clCreateCommandQueue")) {
            goto cleanup;
        }
    }

    /*
     * Copy input data to the device once. The writes are blocking, so both
     * buffers are fully populated before any kernel is enqueued on any queue.
     */
    ret = clEnqueueWriteBuffer(command_queue[0], Amobj, CL_TRUE, 0, bytes, A, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(A)")) {
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue[0], Bmobj, CL_TRUE, 0, bytes, B, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(B)")) {
        goto cleanup;
    }

    /* Create program from source and build it */
    program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
    if (cl_failed(ret, "clCreateProgramWithSource")) {
        goto cleanup;
    }
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (cl_failed(ret, "clBuildProgram")) {
        print_build_log(program, device_id);
        goto cleanup;
    }

    /* Create the task kernels and bind the shared A, B and C buffers */
    for (i = 0; i < NUM_TASKS; i++) {
        kernel[i] = clCreateKernel(program, kernel_names[i], &ret);
        if (cl_failed(ret, kernel_names[i])) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &Amobj);
        if (cl_failed(ret, "clSetKernelArg(A)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &Bmobj);
        if (cl_failed(ret, "clSetKernelArg(B)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &Cmobj);
        if (cl_failed(ret, "clSetKernelArg(C)")) {
            goto cleanup;
        }
    }

    /*
     * Execute the kernels as parallel tasks, one per queue.
     * NOTE(review): all four kernels write to the same buffer C from
     * different queues. They touch disjoint elements, but confirm against
     * the OpenCL specification that concurrent writes to one memory object
     * from several queues are well defined on the target implementation.
     */
    for (i = 0; i < NUM_TASKS; i++) {
        ret = clEnqueueTask(command_queue[i], kernel[i], 0, NULL, NULL);
        if (cl_failed(ret, "clEnqueueTask")) {
            goto cleanup;
        }
    }

    /* Wait for each queue to finish before reading the result */
    for (i = 0; i < NUM_TASKS; i++) {
        printf("Waiting for %d to finish...\n", i);
        ret = clFinish(command_queue[i]);
        if (cl_failed(ret, "clFinish")) {
            goto cleanup;
        }
    }

    /* Copy result to host */
    ret = clEnqueueReadBuffer(command_queue[0], Cmobj, CL_TRUE, 0, bytes, C, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueReadBuffer(C)")) {
        goto cleanup;
    }

    /* Display result */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            printf("%7.2f ", C[i*DIM+j]);
        }
        printf("\n");
    }

    status = EXIT_SUCCESS;

cleanup:
    /* Finalization: release only what was actually created */
    for (i = 0; i < NUM_TASKS; i++) {
        if (kernel[i] != NULL) {
            clReleaseKernel(kernel[i]);
        }
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (Amobj != NULL) {
        clReleaseMemObject(Amobj);
    }
    if (Bmobj != NULL) {
        clReleaseMemObject(Bmobj);
    }
    if (Cmobj != NULL) {
        clReleaseMemObject(Cmobj);
    }
    for (i = 0; i < NUM_TASKS; i++) {
        if (command_queue[i] != NULL) {
            clReleaseCommandQueue(command_queue[i]);
        }
    }
    if (context != NULL) {
        clReleaseContext(context);
    }

    free(A);
    free(B);
    free(C);

    return status;
}
于 2012-04-20T05:09:44.320 回答