当我尝试将 cl_float2 值的数组复制到常量内存时，在 Nvidia 平台上无法按预期工作：.y 分量似乎为零。在 AMD 和 Intel 平台上，我没有遇到这个问题。
// Host
c_Quadrature_Filter_1 = clCreateBuffer(context, CL_MEM_READ_ONLY, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), NULL, &createBufferErrorQuadratureFilter1);
cl_float2* filter_temp = (cl_float2*)malloc(FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2));
cl_float2 test;
test.s[0] = 3.0f;
test.s[1] = 13.0f;
for (int xx = 0; xx < FILTER_SIZE; xx++)
{
for (int yy = 0; yy < FILTER_SIZE; yy++)
{
filter_temp[xx + yy * FILTER_SIZE].s[0] = test.s[0];
filter_temp[xx + yy * FILTER_SIZE].s[1] = test.s[1];
}
}
clEnqueueWriteBuffer(commandQueue, c_Quadrature_Filter_1, CL_TRUE, 0, FILTER_SIZE * FILTER_SIZE * sizeof(cl_float2), filter_temp, 0, NULL, NULL);
free(filter_temp);
// Device
// OpenCL kernel excerpt: each work-item reads the .y component of element 0 of
// the __constant array c_Quadrature_Filter_1 and stores it in the .y component
// of one element of the __global array Filter_Response.
//
// Parameters:
//   Filter_Response       - __global float2 output array; only .y is written here.
//   c_Quadrature_Filter_1 - __constant float2 input array; only element 0 is read.
//   DATA_W, DATA_H        - passed to Calculate3DIndex to form the output index.
//   DATA_D                - not used in the visible body.
//
// NOTE(review): as written, the declaration has no return type and no kernel
// name after `__kernel`; OpenCL C requires `__kernel void <name>(...)`. The name
// appears to have been elided from this excerpt - restore it (matching the
// string given to clCreateKernel on the host) before compiling.
// NOTE(review): no bounds check against DATA_W / DATA_H / DATA_D is visible. If
// the global work size is larger than the data dimensions, the store below may
// be out of range - confirm against the launch configuration.
__kernel(__global float2* Filter_Response, __constant float2* c_Quadrature_Filter_1, __private int DATA_W, __private int DATA_H, __private int DATA_D)
{
// Work-item coordinates in the three global dimensions.
int x = get_global_id(0);
int y = get_global_id(1);
int z = get_global_id(2);
// Calculate3DIndex is defined outside this excerpt; presumably it maps (x, y, z)
// to a linear index using DATA_W and DATA_H - verify against its definition.
// The .x component of the destination element is left untouched.
Filter_Response[Calculate3DIndex(x,y,z,DATA_W,DATA_H)].y = c_Quadrature_Filter_1[0].y;
}