0

I have just started getting into OpenCL and going through the basics of writing a kernel code. I have written a kernel code for calculating shuffled keys for points array. So, for a number of points N, the shuffled keys are calculated in 3-bit fashion, where x-bit at depth d (0

xd = 0 if p.x < Cd.x  
xd = 1, otherwise

The Shuffled xyz key is given as:

x1y1z1x2y2z2...xDyDzD 

The Kernel code written is given below. The point is inputted in a column major format.

__constant float3 boundsOffsetTable[8] = {
              {-0.5,-0.5,-0.5},
              {+0.5,-0.5,-0.5},
              {-0.5,+0.5,-0.5},
              {-0.5,-0.5,+0.5},
              {+0.5,+0.5,-0.5},
              {+0.5,-0.5,+0.5},
              {-0.5,+0.5,+0.5},
              {+0.5,+0.5,+0.5}
};
uint setBit(uint x,unsigned char position)
{
uint mask = 1<<position;
return x|mask;
}

__kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt; 
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
code[i] = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) code = setBit(code,0);
if(pt.y>center.y) code = setBit(code,1);
if(pt.z>center.z) code = setBit(code,2);
for(int l = 1;l<level;l++)
{
    for(int i=0;i<8;i++)
    {
        newRadius = radius *0.5;
        newCenter = center + boundOffsetTable[i]*radius;
        if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
        {
            if(pt.x>newCenter.x) code = setBit(code,3*l);
            if(pt.y>newCenter.y) code = setBit(code,3*l+1);
            if(pt.z>newCenter.z) code = setBit(code,3*l+2);
        }
    }
}
}

It works but I just wanted to ask if I am missing something in the code and if there is an way to optimize the code.

4

1 回答 1

1

试试这个内核:

__kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt; 
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
uint res;
res = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) res = setBit(res,0);
if(pt.y>center.y) res = setBit(res,1);
if(pt.z>center.z) res = setBit(res,2);
for(int l = 1;l<level;l++)
{
    for(int i=0;i<8;i++)
    {
        newRadius = radius *0.5;
        newCenter = center + boundOffsetTable[i]*radius;
        if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
        {
            if(pt.x>newCenter.x) res = setBit(res,3*l);
            if(pt.y>newCenter.y) res = setBit(res,3*l+1);
            if(pt.z>newCenter.z) res = setBit(res,3*l+2);
        }
    }
}
//Save the result
code[i] = res;
}

优化规则:

  1. 避免使用全局内存(您直接使用全局内存中的“代码”,我更改了它),您现在应该会看到性能提高了 3 倍。
  2. 避免使用 Ifs,如果可能,请改用“select”。(参见 OpenCL 文档)
  3. 在内核中使用更多内存。您不需要在位级别进行操作。int 级别的操作会更好,并且可以避免对“setBit”的大量调用。然后你可以在最后构建你的结果。

另一个有趣的事情。如果您在 3D 级别操作,您可以使用 float3 变量并使用 OpenCL 运算符计算距离。这可以大大提高你的表现。BUt 还需要完全重写您的内核。

于 2013-05-15T12:17:01.587 回答