1

我对嵌入式开发和 OpenCL 都很陌生,目前正在尝试编写一段示例代码,让它在支持 OpenCL 1.1 EP 的 i.MX6q 板上运行。

我必须从头开始,所以我参考了这些教程、OpenCL 1.1 参考页面以及这个 OpenCL 示例,来完成我的第一个 OpenCL 实现/应用程序。

基本上,我想开发一个在板上运行的“性能测试”。它包含两个 int 数组(输入和输出):先用随机值填充第一个数组,然后使用 OpenCL 工作项把这些值复制到输出数组中。

我对 clEnqueue(Read/Write)Buffer 函数和 clCreateBuffer 标志(尤其是 CL_MEM_USE_HOST_PTR)感到很困惑,所以我决定看看并练习一下。

我的代码可以正确编译并正确运行,但是当我读取输出数组值时,它们仍然保持为 0。

这是我的代码(C++):

void    buffer_copy(char* kernelfile)
{
    cl_platform_id      platform_id;
    cl_device_id        device_id;
    cl_context          context;
    cl_command_queue    cmd_queue;
    cl_program          program;

    //  Retrieving all the OpenCL data needed
    //  to start the performance test
    platform_id = get_platform();
    device_id = get_device(platform_id);
    context = get_context(platform_id, device_id);
    cmd_queue = get_command_queue(context, device_id);
    program = get_program(context, kernelfile);

    cl_mem      buffer_input, buffer_output;
    size_t      buffer_width = 640, buffer_height = 480;
    size_t      buffer_size = buffer_width * buffer_height;
    cl_kernel   kernel;
    cl_int      err = 0;
    char*       options = "-Werror -cl-std=CL1.1";

    int         data_input[buffer_size];
    int         data_output[buffer_size];

    //  Assigning random values in the data_input array and
    //  initializing the data_output array to zero-values
    srand(time(NULL));
    for (size_t index = 0; index < buffer_size; ++index)
    {
        data_input[index] = rand();
        data_output[index] = 0;
    }

    //  Creating OpenCL buffers
    buffer_input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, buffer_size * sizeof(int), data_input, &err);
    assert(err == CL_SUCCESS);
    buffer_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, buffer_size * sizeof(int), data_output, &err);
    assert(err == CL_SUCCESS);

    err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
    assert(err == CL_SUCCESS);
    kernel = clCreateKernel(program, "buffer_copy", &err);
    assert(err == CL_SUCCESS);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_output);

    size_t  device_max_work_group_size;
    size_t  global_work_size, local_work_size;
    size_t  preferred_work_group_size_multiple;

    cl_ulong    global_mem_size, max_mem_alloc_size;
    clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_mem_size, NULL);
    clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_mem_alloc_size, NULL);
    clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &device_max_work_group_size, NULL);
    std::cout << "Global device memory size: " << global_mem_size << " bytes" << std::endl;
    std::cout << "Device max memory allocation size: " << max_mem_alloc_size << " bytes" << std::endl;
    std::cout << "Device max work group size: " << device_max_work_group_size << std::endl;

    clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &global_work_size, NULL);
    std::cout << "global_work_size value: " << global_work_size << std::endl;

    clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_work_group_size_multiple, NULL);
    local_work_size = global_work_size / preferred_work_group_size_multiple;
    std::cout << "local_work_size value: " << local_work_size << std::endl;

    cl_event events[2];
    err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, 0, &events[0]);
    assert (err == CL_SUCCESS);
    err = clEnqueueReadBuffer(cmd_queue, buffer_output, CL_TRUE, 0, buffer_size * sizeof(int), data_output, 0, NULL, &events[1]);
    assert (err == CL_SUCCESS);
    err = clWaitForEvents(2, events);
    assert (err == CL_SUCCESS);

    for (size_t index = 0; index < buffer_size; ++index)
    {
        if (data_input[index] != data_output[index])
        {
            std::cerr << "Error, values differ (at index " << index << ")." << std::endl;
            break;
        }
        else
        {
            //std::cout << "data_input[index] =\t" << data_input[index] << std::endl;
            //std::cout << "data_output[index] =\t" << data_output[index] << std::endl;
        }
    }

    cl_ulong    time_start, time_end;
    double      total_time;
    clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    total_time = time_end - time_start;
    std::cout << "Execution time in milliseconds: " << (total_time / 1000000.0) << " ms" << std::endl;

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(buffer_input);
    clReleaseMemObject(buffer_output);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
}

这是我的 OpenCL 内核:

/*
 * Copies one int per work-item: the work-item with global id i writes
 * input[i] to output[i]. There is no bounds check, so the host must enqueue
 * no more work-items than there are elements in both buffers.
 */
__kernel void   buffer_copy(__global const int* input, __global int* output)
{
    /* get_global_id() returns size_t; keep that type instead of narrowing it
       to int, which can trigger a conversion warning under -Werror. */
    const size_t id = get_global_id(0);

    output[id] = input[id];
}

现在我只是想让它先工作起来,而不是优化它。我想我在某些地方遗漏了要点,但自己找不出来。在我看来,我可能把 clCreateBuffer 的标志弄混了。

你们能启发我并帮助我解决这个问题吗?


编辑:更新代码+新信息!

看起来值确实被复制了,但被复制的数量只取决于内核工作组大小:CL_DEVICE_MAX_WORK_GROUP_SIZE 返回 1024,CL_KERNEL_WORK_GROUP_SIZE 也返回 1024(这也很奇怪)。所以我的数组的前 1024 个整数被正确复制,但之后的元素就没有被复制了。为了验证这一点,我将 global_work_size 手动设置为 32,再次运行程序,结果只有前 32 个整数被正确复制。我真的不明白这里发生了什么。

4

1 回答 1

0

我想我能够让它同时适用于我的笔记本电脑和 i.MX6q 板。

这是有效的代码:

#include <vector>   // std::vector backs the host-side arrays used by buffer_copy

/**
 * Copies 640*480 ints from one host array to another through an OpenCL kernel
 * and reports how long the kernel and the read-back took.
 *
 * The input array is filled with rand() values, the "buffer_copy" kernel is
 * enqueued with one work-item per element, the result is read back with a
 * blocking read and compared against the input.
 *
 * @param kernelfile  Passed unchanged to get_program(); presumably the path of
 *                    the .cl source that defines the kernel -- confirm against
 *                    get_program().
 *
 * OpenCL failures are checked with assert(), so they are only detected in
 * builds compiled without NDEBUG.
 */
void    buffer_copy(char* kernelfile)
{
    cl_platform_id      platform_id;
    cl_device_id        device_id;
    cl_context          context;
    cl_command_queue    cmd_queue;
    cl_program          program;

    //  Retrieving all the OpenCL data needed
    //  to start the performance test
    platform_id = get_platform();
    device_id = get_device(platform_id);
    context = get_context(platform_id, device_id);
    cmd_queue = get_command_queue(context, device_id);
    program = get_program(context, kernelfile);

    cl_mem      buffer_input, buffer_output;
    size_t      buffer_width = 640, buffer_height = 480;
    size_t      buffer_size = buffer_width * buffer_height;
    cl_kernel   kernel;
    cl_int      err = 0;
    const char* options = "-Werror -cl-std=CL1.1";

    //  Heap storage instead of variable-length stack arrays: 2 x 307200 ints
    //  is too much to place on the stack safely. data_output starts at zero.
    std::vector<int>    data_input(buffer_size);
    std::vector<int>    data_output(buffer_size, 0);

    //  Assigning random values in the data_input array
    srand(time(NULL));
    for (size_t index = 0; index < buffer_size; ++index)
    {
        data_input[index] = rand();
    }

    //  Creating OpenCL buffers that use the host arrays as their storage.
    //  The vectors stay alive until after the buffers are released below.
    buffer_input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, buffer_size * sizeof(int), &data_input[0], &err);
    assert(err == CL_SUCCESS);
    buffer_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, buffer_size * sizeof(int), &data_output[0], &err);
    assert(err == CL_SUCCESS);

    err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
    assert(err == CL_SUCCESS);
    kernel = clCreateKernel(program, "buffer_copy", &err);
    assert(err == CL_SUCCESS);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_input);
    assert(err == CL_SUCCESS);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_output);
    assert(err == CL_SUCCESS);

    cl_ulong    global_mem_size = 0, max_mem_alloc_size = 0;
    size_t      device_max_work_group_size = 0;
    size_t      kernel_work_group_size = 0;
    size_t      preferred_work_group_size_multiple = 0;
    err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_mem_size, NULL);
    assert(err == CL_SUCCESS);
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_mem_alloc_size, NULL);
    assert(err == CL_SUCCESS);
    err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &device_max_work_group_size, NULL);
    assert(err == CL_SUCCESS);
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL);
    assert(err == CL_SUCCESS);
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_work_group_size_multiple, NULL);
    assert(err == CL_SUCCESS);
    std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE : " << global_mem_size << " bytes" << std::endl;
    std::cout << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << max_mem_alloc_size << " bytes" << std::endl;
    std::cout << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << device_max_work_group_size << std::endl;
    std::cout << "CL_KERNEL_WORK_GROUP_SIZE : " << kernel_work_group_size << std::endl;
    std::cout << "CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE : " << preferred_work_group_size_multiple << std::endl;

    //  CL_KERNEL_WORK_GROUP_SIZE is only an upper bound for the local size.
    //  OpenCL 1.1 additionally requires the global size to be evenly divisible
    //  by the local size (e.g. 176 does not divide 307200), so pick the largest
    //  multiple of the preferred multiple, within that bound, that divides it.
    size_t      local_work_size = 0;
    if (preferred_work_group_size_multiple != 0)
    {
        local_work_size = kernel_work_group_size - (kernel_work_group_size % preferred_work_group_size_multiple);
        while (local_work_size != 0 && buffer_size % local_work_size != 0)
        {
            local_work_size -= preferred_work_group_size_multiple;
        }
    }
    //  Fallback: largest plain divisor within the bound (1 always divides).
    if (local_work_size == 0)
    {
        local_work_size = (kernel_work_group_size != 0) ? kernel_work_group_size : 1;
        while (buffer_size % local_work_size != 0)
        {
            --local_work_size;
        }
    }

    cl_event    events[2];
    err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &buffer_size, &local_work_size, 0, NULL, &events[0]);
    assert (err == CL_SUCCESS);
    //  Blocking read that waits on the kernel event before copying back
    err = clEnqueueReadBuffer(cmd_queue, buffer_output, CL_TRUE, 0, buffer_size * sizeof(int), &data_output[0], 1, &events[0], &events[1]);
    assert (err == CL_SUCCESS);
    err = clWaitForEvents(2, events);
    assert (err == CL_SUCCESS);

    //  Report the first mismatch only
    for (size_t index = 0; index < buffer_size; ++index)
    {
        if (data_input[index] != data_output[index])
        {
            std::cerr << "Error, values differ (at index " << index << ")." << std::endl;
            break;
        }
    }

    //  Per-command timings; the counters are in nanoseconds and are only
    //  valid when the profiling queries succeed.
    cl_ulong    time_start = 0, time_end = 0;
    double      total_time = 0.0;
    cl_int      start_status, end_status;

    start_status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    end_status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    if (start_status == CL_SUCCESS && end_status == CL_SUCCESS)
    {
        total_time = static_cast<double>(time_end - time_start);
        std::cout << "clEnqueueNDRangeKernel execution time in milliseconds: " << (total_time / 1000000.0) << " ms" << std::endl;
    }
    else
    {
        std::cerr << "Kernel profiling data unavailable (is CL_QUEUE_PROFILING_ENABLE set on the command queue?)." << std::endl;
    }

    start_status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
    end_status = clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
    if (start_status == CL_SUCCESS && end_status == CL_SUCCESS)
    {
        total_time = static_cast<double>(time_end - time_start);
        std::cout << "clEnqueueReadBuffer execution time in milliseconds: " << (total_time / 1000000.0) << " ms" << std::endl;
    }
    else
    {
        std::cerr << "Read-back profiling data unavailable (is CL_QUEUE_PROFILING_ENABLE set on the command queue?)." << std::endl;
    }

    clReleaseEvent(events[0]);
    clReleaseEvent(events[1]);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(buffer_input);
    clReleaseMemObject(buffer_output);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
}

如您所见,我只是使用 OpenCL 1.1 EP 将 640*480 (307200) 个整数从一个数组复制到另一个数组。

我从主机端分配了两个内存缓冲区,并告诉 OpenCL 通过主机指针使用它们(如果我是对的,这意味着没有 memcpy)。

这是我的笔记本电脑的输出(在 GeForce GTX 765m 上工作):

CL_DEVICE_GLOBAL_MEM_SIZE : 2094923776 bytes
CL_DEVICE_MAX_MEM_ALLOC_SIZE : 523730944 bytes
CL_DEVICE_MAX_WORK_GROUP_SIZE : 1024
CL_KERNEL_WORK_GROUP_SIZE : 1024
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE : 32

clEnqueueNDRangeKernel execution time in milliseconds: 0.061856 ms
clEnqueueReadBuffer execution time in milliseconds: 0.100544 ms

这是 i.MX6q SoM 的输出(在 Vivante GC2000 GPU 上工作):

CL_DEVICE_GLOBAL_MEM_SIZE : 67108864 bytes
CL_DEVICE_MAX_MEM_ALLOC_SIZE : 33554432 bytes
CL_DEVICE_MAX_WORK_GROUP_SIZE : 1024
CL_KERNEL_WORK_GROUP_SIZE : 176
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE : 16

clEnqueueNDRangeKernel execution time in milliseconds: 4.463 ms
clEnqueueReadBuffer execution time in milliseconds: 7.199 ms

什么问题 ?
我认为我给 clEnqueueNDRangeKernel 函数提供了错误的 global_work_size 和 local_work_size 值。然而,我仍然不太明白它们是如何工作的,以及应该如何计算它们。我也不明白这些值与 CL_KERNEL_WORK_GROUP_SIZE 之间的区别,以及 OpenCL 编译器是如何计算内核工作组大小的。为什么 SoM 和我的笔记本电脑上的 CL_KERNEL_WORK_GROUP_SIZE 不同?我用的明明是同一个内核。

有什么优化推荐吗?
如果您有任何优化建议给我,我将不胜感激!所有这些内容都是为了学习如何进行一些图像处理和开发算法以使它们与 OpenCL 一起工作(因为我不能在这个 SoM 上使用 OpenCV)。

于 2017-09-24T15:00:53.263 回答