0

我已将文本存储在 char 数组 a[textLength] 中,并将模式存储在数组 b[patternLength] 中:

// Host-side copy of the text: allocate textLength cl_char elements and copy
// text[] into them one element at a time.
// NOTE(review): the malloc result is not checked for NULL before the loop
// writes through it.
cl_char *a = (cl_char *) malloc(textLength*sizeof(cl_char));

for(int i =0; i<textLength;i++)
{
    a[i]=text[i];
    }

// A buffer object is a handle to a region of memory.
// Create the buffer that holds the text, initialised from the host array a.
// NOTE(review): errcode_ret is NULL, so a failed clCreateBuffer is only
// detectable by checking whether a_buffer itself is NULL.
cl_mem a_buffer = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | // buffer object read only for kernel
                                 CL_MEM_COPY_HOST_PTR, // copy data from memory referenced
                                 // by host pointer
                                 textLength*sizeof(cl_char), // size in bytes of buffer object
                                 a, // host pointer
                                 NULL); // no error code returned

// Host-side copy of the pattern: allocate patternLength cl_char elements and
// copy pattern[] into them one element at a time.
// NOTE(review): the malloc result is not checked for NULL before the loop
// writes through it.
cl_char *b = (cl_char *) malloc(patternLength*sizeof(cl_char));

for(int i =0; i<patternLength;i++)
{
   b[i]=pattern[i];
}

// Creation of a buffer object for the pattern is disabled (commented out)
// below. b_buffer is set to NULL instead, so in this snippet the contents of
// b[] are never placed in any buffer object.
/*cl_mem b_buffer = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | // buffer object read only for kernel
                                 CL_MEM_COPY_HOST_PTR, // copy data from memory referenced
                                 // by host pointer
                                 patternLength*sizeof(cl_char), // size in bytes of buffer object
                                 b, // host pointer
                                 NULL); // no error code returned */
cl_mem b_buffer = NULL;


    // Bind the kernel arguments, then launch a 1-dimensional NDRange.
    // NOTE(review): none of the clSetKernelArg return codes are checked.

    // Argument 0: the buffer object holding the text.
    clSetKernelArg(kernel, 0, sizeof(a_buffer), (void*) &a_buffer);
// Argument 1: NULL arg_value. NOTE(review): if this is a __local pointer
// argument, arg_size is the number of bytes of local memory to allocate, so
// sizeof(cl_mem) is presumably not the intended size - confirm.
clSetKernelArg(kernel, 1, sizeof(cl_mem), NULL);
// Argument n: address of b_buffer, which is NULL at this point.
// NOTE(review): if argument n is declared __local, arg_value must be NULL
// rather than a pointer to a cl_mem - confirm against the kernel signature.
clSetKernelArg(kernel, n, sizeof(cl_mem), &b_buffer);
    size_t global_work_size = numberofWorkItem;
    // NOTE(review): local_work_size must point to a size_t; patternLength is
    // declared outside this snippet - TODO confirm its type.
    cl_int error= clEnqueueNDRangeKernel(queue, kernel,
                       1, NULL, // global work items dimensions and offset
                       &global_work_size, // number of global work items
                       &patternLength, // number of local work items
                       0, NULL, // don't wait on any events to complete
                       &timeEvent); // event object for this launch is returned in timeEvent

我了解到,在 clSetKernelArg 中,对于 __local 限定的参数,arg_value 应为 NULL。我通过设置 b_buffer=NULL; 做到了这一点。

但是这样做会导致 b_buffer 无法存储 b[] 的值(即模式)。我该怎么做?

另外,如果我没记错的话,local_work_size 不能大于 CL_DEVICE_MAX_WORK_ITEM_SIZES 给出的值,因为 local_work_size 受到底层设备/硬件的限制。另一方面,global_work_size 可以任意大。它必须是 local_work_size 的倍数吗?如果是,为什么?

4

2 回答 2

0

您的错误在 clSetKernelArg 行中:

// incorrect: passes the address of a cl_mem handle as the value of a
// __local kernel argument
clSetKernelArg(kernel, n, sizeof(cl_mem), &b_buffer);

// correct: for a __local argument, arg_value is NULL and arg_size is the
// number of bytes of local memory to allocate for it
clSetKernelArg(kernel, n, sizeof(cl_char)*patternLength, NULL);

内核执行结束后,本地内存中的内容不会保留,因此您无法用您的方法取回 b_buffer 的副本。此外,本地内存的内容不能由主机填充:主机只能指定其大小。您需要在内核中把数据从全局内存参数复制到本地内存(LDS)中。

要取回本地内存中的数据,除了本地参数之外,还需要传入一个全局 cl_mem 缓冲区。可以在内核结束前把本地数据复制到该全局缓冲区,然后使用 clEnqueueReadBuffer 将其读回主机。

更新

下面是一个具体示例,演示如何使用动态分配的本地缓冲区,并用全局缓冲区的内容对其进行初始化。

/*
 * Example: cooperatively fill a dynamically sized __local buffer from a
 * __global buffer.
 *
 * srcBuff   - global source data; elements [0, copyCount) are read.
 * localBuff - local buffer; elements [0, copyCount) are written. Its size is
 *             set by the host with clSetKernelArg(kernel, idx, bytes, NULL).
 * copyCount - number of ints to copy.
 */
__kernel void copyBufferExample(__global int* srcBuff, __local int* localBuff, const int copyCount)
{
    int lid = get_local_id(0);
    int ls = get_local_size(0);
    int i;

    /* Each work-item copies elements lid, lid + ls, lid + 2*ls, ... so the
       work-group covers all copyCount elements even when copyCount exceeds
       the work-group size. */
    for(i=lid; i<copyCount; i+=ls){
        localBuff[i] = srcBuff[i];
    }

    /* Every work-item in the group must finish its part of the copy before
       any work-item reads elements written by another one. */
    barrier(CLK_LOCAL_MEM_FENCE);

    //use localBuff here
    //copy result back to global memory if needed
}
于 2012-07-12T11:36:17.913 回答
0

上面的代码不会并行复制……

而下面这段代码可以……

_内核 void copyBufferExample( _global int* srcBuff, __local int* localBuff, const int copyCount) {

int i = get_local_id(0);

if ( i < copyCount) localBuff[i] = srcBuff[i]; // 每个线程复制 1 个 int。不需要for循环

barrier(CLK_LOCAL_MEM_FENCE);  // synchronize all threads before using the local memory


//use localBuff here
//copy result back to global memory if needed

}

于 2012-07-17T03:17:36.943 回答