0

我正在通过 Matthew Scarpino 的《OpenCL in Action》一书学习 OpenCL。第 3 章包含有关缓冲区(pp.45-47)和子缓冲区(pp.47-48)组织的材料。第 47 页上用于创建子缓冲区的代码示例存在不准确之处,此处和此处的类似问题已对此进行了说明。我更进一步,决定研究如何将存储在子缓冲区中的值传输回主机程序。

我的主机程序将整数数组 iaArray1[5] = { 1, 2, 3, 4, 5 } 传输到内核中,使用缓冲区 memObjArray1 完成此操作。内核将数组的每个值与常数 2 相加,因此由 iaArray1 得到数组 ipaArray2,其值为 { 3, 4, 5, 6, 7 }。输出缓冲区 memObjArray2 用于将 ipaArray2 的全部值从设备传输到主机程序。接下来,从缓冲区 memObjArray2 创建子缓冲区 memObjSubArray,并尝试将数据从设备内存中的子缓冲区 memObjSubArray 传输到主机程序。

我相信缓冲区和子缓冲区的数据从内核传输到主机程序的机制是相同的。为此,我使用了相同的函数clEnqueueReadBuffer(),但程序给出了错误消息。我究竟做错了什么?应该使用什么函数将数据从设备内存子缓冲区传输到主机程序?

内核函数如下:

/* Work-item i reads iaArray1[i], adds the constant 2 and stores the sum in
 * iaArray2[i]. There is no bounds check, so the global work size must not
 * exceed the number of elements in either buffer. */
__kernel void good(global int* iaArray1, global int* iaArray2)
{
    const int idx = get_global_id(0);
    const int shifted = iaArray1[idx] + 2;

    iaArray2[idx] = shifted;
}

这是我的程序的代码。所呈现的程序有几个简化。首先,简化了出口分支以缩短代码。其次,原始程序设计为使用多个 cl 文件,因此其中一些变量是数组。

#include <CL\cl.h>
#include <stdio.h>
#include <stdlib.h>

#define PROGRAM_FILE_1 "good.cl"
//#define PROGRAM_FILE_2 "bad.cl"
//#define PROGRAM_FILE_3 "setminusone.cl"
#define NUM_OF_FILES 1

int main(){
    cl_platform_id *platforms;
    cl_uint numOfPlatforms;
    cl_int status;
    cl_device_id *devices;
    cl_uint numOfDevices;
    char caDeviceName[500];
    cl_context context;

    const char * kcpaFileName[NUM_OF_FILES] = { PROGRAM_FILE_1};
    FILE * pProgramHandler;
    char * cpaProgramBuffer[NUM_OF_FILES];
    size_t saProgramSize[NUM_OF_FILES] = { 0};
    cl_uint numOfEnters[NUM_OF_FILES] = { 0};

    cl_program program;
    const char kcaOptions[] = "-cl-finite-math-only -cl-no-signed-zeros";
    size_t sLogSize = 0;
    char * cpProgramLog;

    cl_uint numOfKernels = 0;
    cl_kernel * kernels;
    char caKernelName[20];

    cl_command_queue cmdQueue0;

    printf("Establishing number of available platforms... ");
    status = clGetPlatformIDs(NULL, NULL, &numOfPlatforms);
    if (status < 0){
        printf("FAIL to establish platform(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablised %u platform(s).\nInitializing platform(s)... ", numOfPlatforms);
    platforms = (cl_platform_id *)malloc(numOfPlatforms*sizeof(cl_platform_id));
    status = clGetPlatformIDs(numOfPlatforms, platforms, NULL); //
    if (status < 0){
        printf("FAIL to initialize platform(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nEstablishing devices... ");
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, NULL, NULL, &numOfDevices);
    if (status < 0){
        printf("FAIL to establish device(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nEstablished %u device(s).\nInitializing device(s)... ", numOfDevices);
    devices = (cl_device_id *)malloc(numOfDevices*sizeof(cl_device_id));
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL);
    if (status < 0){
        printf("FAIL to initialize devices(s)!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.");
    for (int i = 0; i < numOfDevices; i++){
        status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(caDeviceName), caDeviceName, NULL);
        if (status < 0){
            printf("FAIL to read device #%d name!> %d\n", i, status);
            system("PAUSE");
            exit(1);
        }
        printf("\nDevice #%d is \"%s\".", i, caDeviceName);
    }


    printf("\nCreating context... ");
    context = clCreateContext(NULL, numOfDevices, devices, NULL, NULL, &status);
    if (status < 0){
        printf("FAIL to create context!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nReading source code from file(s)... ");
    for (int i = 0; i < NUM_OF_FILES; i++){
        pProgramHandler = fopen(kcpaFileName[i], "r");
        if (pProgramHandler == NULL){
            printf("FAIL to open file \"%s\"!> %d\n", kcpaFileName[i], status);
            system("PAUSE");
            exit(1);
        }
        fseek(pProgramHandler, 0, SEEK_END);
        saProgramSize[i] = ftell(pProgramHandler);
        rewind(pProgramHandler);
        cpaProgramBuffer[i] = (char*)malloc(sizeof(char)*saProgramSize[i] + 1);
        fread(cpaProgramBuffer[i], sizeof(char), saProgramSize[i], pProgramHandler);
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
        fclose(pProgramHandler);
        for (int j = 0; j < saProgramSize[i]; j++){
            if ((char)cpaProgramBuffer[i][j] == (char)10){
                numOfEnters[i]++;
            }
        }
        saProgramSize[i] = saProgramSize[i] - numOfEnters[i];
        cpaProgramBuffer[i][saProgramSize[i]] = '\0';
    }
    printf("OK.\nCreating program from source code... ");
    program = clCreateProgramWithSource(context, NUM_OF_FILES, (const char **)cpaProgramBuffer, (const size_t *)saProgramSize, &status);
    if (status < 0){
        printf("FAIL to create program!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nBuilding program... ");
    status = clBuildProgram(program, 1, devices, kcaOptions, NULL, NULL);//, 
    if (status < 0){
        printf("FAIL to build program.\n...Genetating log...");
        for (int i = 0; i < NUM_OF_FILES; i++){
            printf("\nCode from file \"%s\":\n%s", kcpaFileName[i], cpaProgramBuffer[i]);
        }
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &sLogSize);
        cpProgramLog = (char*)malloc(sizeof(char)*sLogSize + 1);
        cpProgramLog[sLogSize] = '\0';
        clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sLogSize + 1, cpProgramLog, NULL);
        printf("\nLog length is %d.\nLog:\n%s\n> %d\n", sLogSize, cpProgramLog, status);
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nDetermining number of kernels... ");
    status = clCreateKernelsInProgram(program, NULL, NULL, &numOfKernels);
    if (status < 0){
        printf("FAIL to determine number of kernels!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nDetermined %d kernel(s):", numOfKernels);
    kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*numOfKernels);
    clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);
    for (int i = 0; i < numOfKernels; i++){
        clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(caKernelName), caKernelName, NULL);
        printf("\nKernel \"%s\" indexed at %d.", caKernelName, i);
    }

    printf("\nCreating command queue... ");
    cmdQueue0 = clCreateCommandQueue(context, devices[0], NULL, &status);
    if (status < 0){
        printf("FAIL to create command queue!> %d\n", status);
        system("PAUSE");
        exit(1);
    }

    /*Data, buffers and subbuffers*/
    int iaArray1[5] = { 1, 2, 3, 4, 5 };
    printf("\nPrinting out the initial array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ", iaArray1[i]);
    }
    printf("\nCreating buffers for kernels[0]... ");
    int* ipaArray2 = (int*)malloc(5 * sizeof(int));
    cl_mem memObjArray1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(iaArray1), &iaArray1, &status);
    if (status < 0){
        printf("\nFAIL to create memObjArray1 buffer!> %d \n", status);
        system("PAUSE");
        exit(1);
    }
    cl_mem memObjArray2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(iaArray1), NULL, &status);
    if (status < 0){
        printf("\nFAIL to create memObjArray2 buffer!> %d \n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nSetting arguments for kernels[0]... ");
    status = clSetKernelArg(kernels[0], 0, sizeof(cl_mem), &memObjArray1);
    if (status < 0){
        printf("\nFAIL to set memObjArray1 argument at kernels[0]!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0], 1, sizeof(cl_mem), &memObjArray2);
    if (status < 0){
        printf("\nFAIL to set memObjArray2 argument at kernels[0]!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernels[0]... ");
    size_t tGlobal_item_size = 5;   //?
    size_t tLocal_item_size = 1;    //?
    status = clEnqueueNDRangeKernel(cmdQueue0, kernels[0], 1, NULL, &tGlobal_item_size, &tLocal_item_size, 0, NULL, NULL);
    if (status < 0){
        printf("\nFAIL to enqueue kernels[0] into cmdQueue0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjArray2, CL_TRUE, 0, 5 * sizeof(int), ipaArray2, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy results from device to host!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < 5; i++){
        printf("%d ", ipaArray2[i]);
    }

    printf("\nCreating subbuffer... ");
    int iQuantity = 2;
    int iShift = 2;
    typedef struct _cl_buffer_region{
        size_t size;
        size_t origin;
    } cl_buffer_region;
    cl_buffer_region stRegion;
    stRegion.size = iQuantity * sizeof(int);
    stRegion.origin = iShift * sizeof(int);
    cl_mem memObjSubArray = clCreateSubBuffer(memObjArray2, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &stRegion, &status);
    if (status < 0){
        printf("FAIL to create subbuffer!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory subbuffer to host array... ");
    int* ipaSubArray = (int*)malloc(iQuantity*sizeof(int));
    status = clEnqueueReadBuffer(cmdQueue0, memObjSubArray, CL_TRUE, 0, iQuantity*sizeof(int), ipaSubArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy results from device to host!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out the result array:\n");
    for (int i = 0; i < iQuantity; i++){
        printf("%d ", ipaSubArray[i]);
    }


    printf("OK.\n...Releasing resources... ");
    clReleaseMemObject(memObjArray1);
    clReleaseMemObject(memObjArray2);
    clReleaseMemObject(memObjSubArray);

    clReleaseCommandQueue(cmdQueue0);
    free(kernels);
    clReleaseProgram(program);
    for (int i = 0; i < NUM_OF_FILES; i++){
        free(cpaProgramBuffer[i]);
    }
    clReleaseContext(context);
    free(devices);
    free(platforms);
    printf("OK.\nEnd of program. Bey!\n");
    system("PAUSE");
}

程序执行日志文件

4

1 回答 1

0

似乎没有专门的函数可以将数据从设备上的子缓冲区内存对象传输到主机程序的内存中。不过,子缓冲区数据无法直接读回的问题,可以借助一个辅助缓冲区和数据复制函数 clEnqueueCopyBuffer() 来解决。它的规格可以在这里找到。它的第四个输入参数指定源缓冲区中数据的起始偏移量,第五个输入参数指定目标缓冲区中数据的偏移量,第六个参数指定要复制的数据量(均以字节为单位)。

clEnqueueCopyBuffer() 函数的调用示例如下:

cl_int status = clEnqueueCopyBuffer(cmdQueue0, memObjInput, memObjOutput, 
                                    sizeof(int)*tSrcBufOffset,
                                    sizeof(int)*tDestBufOffset,
                                    sizeof(int)*tQuantityToCopy, 
                                    NULL, NULL, NULL); 

例如,我编写了一个替换部分缓冲区数据的程序。针对原始整数数组 { 1, 2, 3, 4, 5 },在设备内存中创建一个缓冲区。从该缓冲区中取出第 2 和第 3 个元素 { 2, 3 } 并显示在屏幕上。然后,在内核中将缓冲区每个元素的值增加 2,得到 { 3, 4, 5, 6, 7 }。内核的执行结果返回给主机程序并显示出来。接下来,用存储在辅助缓冲区中的值替换输出缓冲区的第 3 和第 4 个元素,得到 { 3, 4, 2, 3, 7 }。

程序数据流的本质图形是这样的:

程序数据流本质

代码的主要部分如下所示。要执行它,只需将其插入先前给出的适当代码中。

...
    /*Data and buffers*/
    /*kernels[0]*/

    /* Demonstration of clEnqueueCopyBuffer(): copies two elements of the
       input buffer into an auxiliary buffer, runs kernels[0], then overwrites
       two elements of the kernel output with the saved values.
       Uses context, cmdQueue0, kernels and status from the enclosing code.
       NOTE(review): memObjInput, memObjOutput, memObjSubBuffer,
       ipaOutputArray and ipaSubArray are not released here; the code that
       follows this fragment has to do it. */

    // two arrays and buffers creation
    int iaInputArray[] = { 1, 2, 3, 4, 5 };
    /* Element count taken from the array so the two cannot drift apart. */
    int iSizeOfArray = (int)(sizeof(iaInputArray) / sizeof(iaInputArray[0]));
    int* ipaOutputArray = (int*)malloc(iSizeOfArray*sizeof(int));
    cl_mem memObjInput;
    cl_mem memObjOutput;
    cl_mem memObjSubBuffer;
    size_t tGlobal_item_size = iSizeOfArray;    /* one work-item per element */
    size_t tLocal_item_size = 1;                /* work-group size */
    size_t tSrcBufOffset;           //offset in source buffer
    size_t tDstBufOffset;           //offset in destination buffer
    size_t tNumbOfElementsToCopy=2; //number of elements to copy
    int* ipaSubArray = (int*)malloc(tNumbOfElementsToCopy*sizeof(int));

    if (ipaOutputArray == NULL || ipaSubArray == NULL){
        printf("FAIL to allocate memory for host arrays!\n");
        system("PAUSE");
        exit(1);
    }

    printf("OK.\nPrinting out initial input array:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", iaInputArray[i]);
    }
    printf("\nCreating buffer memory objects... ");
    memObjInput = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(iaInputArray), iaInputArray, &status);
    if (status < 0){
        printf("FAIL to create buffer for input data!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    memObjOutput = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(iaInputArray), NULL, &status);
    if (status < 0){
        printf("FAIL to create buffer for output data!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    /* ipaSubArray is a pointer, so sizeof(ipaSubArray) is the pointer size
       and not the size of the data; the buffer is sized from the element
       count instead. */
    memObjSubBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, tNumbOfElementsToCopy*sizeof(int), NULL, &status);
    if (status < 0){
        printf("FAIL to create auxiliary buffer for copied elements!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nCopying 2nd and 3rd elements of the initial array into sub-buffer... ");
    tSrcBufOffset = 1;
    tDstBufOffset = 0;
    /* num_events_in_wait_list is a cl_uint: 0, not NULL. */
    status = clEnqueueCopyBuffer(cmdQueue0, memObjInput, memObjSubBuffer, sizeof(int)*tSrcBufOffset, sizeof(int)*tDstBufOffset, sizeof(int)*tNumbOfElementsToCopy, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy buffers!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring copied elements to host-program... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjSubBuffer, CL_TRUE, 0, tNumbOfElementsToCopy*sizeof(int), ipaSubArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out copied elements:\n");
    for (size_t i = 0; i < tNumbOfElementsToCopy; i++){
        printf("%d ", ipaSubArray[i]);
    }
    printf("\nSetting kernel arguments... ");
    status = clSetKernelArg(kernels[0], 0, sizeof(cl_mem), &memObjInput);
    if (status < 0){
        printf("FAIL to set kernel argument #0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    status = clSetKernelArg(kernels[0], 1, sizeof(cl_mem), &memObjOutput);
    if (status < 0){
        printf("FAIL to set kernel argument #1!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nExecuting kernel... ");
    status = clEnqueueNDRangeKernel(cmdQueue0, kernels[0], 1, NULL, &tGlobal_item_size, &tLocal_item_size, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to enqueue kernels[0] into cmdQueue0!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nReading results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjOutput, CL_TRUE, 0, iSizeOfArray*sizeof(int), ipaOutputArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out data obtained from kernel:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", ipaOutputArray[i]);
    }
    printf("\nChanging 3rd and 4th elements of data in output buffer... ");
    tSrcBufOffset = 0;
    tDstBufOffset = 2;
    status = clEnqueueCopyBuffer(cmdQueue0, memObjSubBuffer, memObjOutput, sizeof(int)*tSrcBufOffset, sizeof(int)*tDstBufOffset, sizeof(int)*tNumbOfElementsToCopy, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to copy buffers!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nTransferring results from device memory buffer to host array... ");
    status = clEnqueueReadBuffer(cmdQueue0, memObjOutput, CL_TRUE, 0, iSizeOfArray*sizeof(int), ipaOutputArray, 0, NULL, NULL);
    if (status < 0){
        printf("FAIL to transfer data from device memory buffer to host array!> %d\n", status);
        system("PAUSE");
        exit(1);
    }
    printf("OK.\nPrinting out host array data:\n");
    for (int i = 0; i < iSizeOfArray; i++){
        printf("%d ", ipaOutputArray[i]);
    }
    printf("\n...Releasing resources... ");
...

程序执行打印画面: 程序执行日志

于 2021-05-20T14:12:38.777 回答