我正在 OpenCL 中编写一个程序,它接收两个点数组,并计算每个点的最近邻。
我有两个程序。其中之一将计算 4 个维度的距离,一个用于 6 个维度。它们如下:
4 个维度:
kernel void BruteForce(
global read_only float4* m,
global float4* y,
global write_only ushort* i,
read_only uint mx)
{
int index = get_global_id(0);
float4 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = -1;
int x = 0;
int mmx = mx;
for(x = 0; x < mmx; x++)
{
float dist = fast_distance(curY, m[x]);
if (dist < minDist)
{
minDist = dist;
minIdx = x;
}
}
i[index] = minIdx;
y[index] = minDist;
}
6 个维度:
kernel void BruteForce(
global read_only float8* m,
global float8* y,
global write_only ushort* i,
read_only uint mx)
{
int index = get_global_id(0);
float8 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = -1;
int x = 0;
int mmx = mx;
for(x = 0; x < mmx; x++)
{
float8 mx = m[x];
float d0 = mx.s0 - curY.s0;
float d1 = mx.s1 - curY.s1;
float d2 = mx.s2 - curY.s2;
float d3 = mx.s3 - curY.s3;
float d4 = mx.s4 - curY.s4;
float d5 = mx.s5 - curY.s5;
float dist = sqrt(d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4 + d5 * d5);
if (dist < minDist)
{
minDist = dist;
minIdx = index;
}
}
i[index] = minIdx;
y[index] = minDist;
}
我正在寻找针对 GPGPU 优化该程序的方法。我已经阅读了一些关于使用本地内存优化 GPGPU 的文章(包括http://www.macresearch.org/opencl_episode6,它带有源代码)。我试过应用它并想出了这个代码:
kernel void BruteForce(
global read_only float4* m,
global float4* y,
global write_only ushort* i,
__local float4 * shared)
{
int index = get_global_id(0);
int lsize = get_local_size(0);
int lid = get_local_id(0);
float4 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = 64000;
int x = 0;
for(x = 0; x < {0}; x += lsize)
{
if((x+lsize) > {0})
lsize = {0} - x;
if ( (x + lid) < {0})
{
shared[lid] = m[x + lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int x1 = 0; x1 < lsize; x1++)
{
float dist = distance(curY, shared[x1]);
if (dist < minDist)
{
minDist = dist;
minIdx = x + x1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
i[index] = minIdx;
y[index] = minDist;
}
我的“i”输出得到了垃圾结果(例如许多相同的值)。谁能指出我正确的方向?我将不胜感激任何有助于我改进此代码的答案,或者可能找到上面优化版本的问题。
非常感谢 Cauê