我目前正在学习 OpenCL,并且我的内核在直接访问全局数组时工作得很好,但是在私有内存上使用中间值时会给出错误的结果,例如,下面代码中的aux。
__kernel void kernel_cte(__global float *U0,__global float *U1,__constant float *VP0, uint stride, uint nnoi, __constant float *g_W, uint k0, uint k1, float FATMDFX, float FATMDFY, float FATMDFZ) {
uint index = get_global_id(1)*nnoi + get_global_id(0) + k0 * stride;
uint k;
float aux;
aux = U0[index+1];
for(k=k0;k<k1;++k) {
if(VP0[index] > 0.0f){
U1[index] = 2.0f * U0[index] - U1[index]
+ FATMDFX * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index - 6] + U0[index + 6])
+ g_W[5] * (U0[index - 5] + U0[index + 5])
+ g_W[4] * (U0[index - 4] + U0[index + 4])
+ g_W[3] * (U0[index - 3] + U0[index + 3])
+ g_W[2] * (U0[index - 2] + U0[index + 2])
+ g_W[1] * (U0[index - 1] + aux)
+ g_W[0] * U0[index]
)
+ FATMDFY * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index - 6 * nnoi] + U0[index + 6 * nnoi])
+ g_W[5] * (U0[index - 5 * nnoi] + U0[index + 5 * nnoi])
+ g_W[4] * (U0[index - 4 * nnoi] + U0[index + 4 * nnoi])
+ g_W[3] * (U0[index - 3 * nnoi] + U0[index + 3 * nnoi])
+ g_W[2] * (U0[index - 2 * nnoi] + U0[index + 2 * nnoi])
+ g_W[1] * (U0[index - nnoi] + U0[index + nnoi])
+ g_W[0] * U0[index]
)
+ FATMDFZ * VP0[index] * VP0[index] * (
+ g_W[6] * (U0[index + 6 * stride] + U0[index - 6 * stride])
+ g_W[5] * (U0[index + 5 * stride] + U0[index - 5 * stride])
+ g_W[4] * (U0[index + 4 * stride] + U0[index - 4 * stride])
+ g_W[3] * (U0[index + 3 * stride] + U0[index - 3 * stride])
+ g_W[2] * (U0[index + 2 * stride] + U0[index - 2 * stride])
+ g_W[1] * (U0[index + stride] + U0[index - stride])
+ g_W[0] * U0[index]
);
} // end if
index += stride;
}
}
我想使用向量来执行这些计算,但我不明白为什么当我执行 aux = U0[index+1] 时没有将正确的值复制到私有内存中。