cuda - 使用 OpenCL 的累积数组求和

Question

我正在使用 OpenCL 计算 n 维点之间的欧几里得距离。我得到两个 n 维点列表，我应该返回一个数组，其中只包含从第一个表中的每个点到第二个表中的每个点的距离。

我的方法是进行常规的双循环（对于 Table1 中的每个点{对于 Table2 中的每个点 {...} }，然后对并行中的每一对点进行计算。

然后将欧几里得距离分成 3 部分： 1. 取点中每个维度之间的差 2. 平方差（仍然是每个维度） 3. 将 2 中获得的所有值相加。 4. 取点的平方根3中得到的值。（本例中省略了这一步。）

在我尝试累积所有差异的总和之前，一切都像魅力一样工作（即执行上述过程的第 3 步，下面代码的第 49 行）。

作为测试数据，我使用的是 DescriptorLists，每个点有 2 个点： DescriptorList1: 001,002,003,...,127,128; (p1) 129,130,131,...,255,256; (p2)

描述符列表2：000,001,002,...,126,127；(p1) 128,129,130,...,254,255；(p2)

所以生成的向量应该具有以下值：128、2064512、2130048、128 现在我得到的随机数随每次运行而变化。

对于我做错的事情，我很感激任何帮助或线索。希望一切都清楚我正在工作的场景。

#define BLOCK_SIZE 128

typedef struct
{
    //How large each point is
    int length;
    //How many points in every list
    int num_elements;
    //Pointer to the elements of the descriptor (stored as a raw array)
    __global float *elements;
} DescriptorList;

__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float As[BLOCK_SIZE])
{

    int gpidA = get_global_id(0);

    int featA = get_local_id(0);

    //temporary array  to store the difference between each dimension of 2 points
    float dif_acum[BLOCK_SIZE];

    //counter to track the iterations of the inner loop
    int loop = 0;

    //loop over all descriptors in A
    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){

        //take the i-th descriptor. Returns a DescriptorList with just the i-th
        //descriptor in DescriptorList A
        DescriptorList tmpA = GetDescriptor(A, i);

        //copy the current descriptor to local memory.
        //returns one element of the only descriptor in DescriptorList tmpA
        //and index featA
        As[featA] = GetElement(tmpA, 0, featA);
        //wait for all the threads to finish copying before continuing
        barrier(CLK_LOCAL_MEM_FENCE);

        //loop over all the descriptors in B
        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
            //take the difference of both current points
            dif_acum[featA] = As[featA]-B.elements[k*BLOCK_SIZE + featA];
            //wait again
            barrier(CLK_LOCAL_MEM_FENCE);
            //square value of the difference in dif_acum and store in C
            //which is where the results should be stored at the end.
            C[loop] = 0;
            C[loop] += dif_acum[featA]*dif_acum[featA];
            loop += 1;
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }
}

score 7 · Accepted Answer

您的问题在于这些代码行：

C[loop] = 0;
C[loop] += dif_acum[featA]*dif_acum[featA];

工作组中的所有线程（嗯，实际上是所有线程，但稍后再说）都在尝试同时修改这个数组位置，而没有任何同步。有几个因素使这确实有问题：

工作组不能保证完全并行工作，这意味着对于某些线程 C[loop] = 0 可以在其他线程已经执行下一行之后调用
那些并行执行的都从 C[loop] 读取相同的值，用它们的增量修改它并尝试写回相同的地址。我不完全确定回写的结果是什么（我认为其中一个线程成功回写，而其他线程失败，但我不完全确定），但无论哪种方式都是错误的。

现在让我们来解决这个问题：虽然我们可以使用原子方法让它在全局内存上工作，但它不会很快，所以让我们在本地内存中累积：

local float* accum;
...
accum[featA] = dif_acum[featA]*dif_acum[featA];
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int i = 1; i < BLOCKSIZE; i *= 2)
{
    if ((featA % (2*i)) == 0)
        accum[featA] += accum[featA + i];
    barrier(CLK_LOCAL_MEM_FENCE);
}
if(featA == 0)
    C[loop] = accum[0];

当然，您可以为此重用其他本地缓冲区，但我认为这一点很清楚（顺便说一句：您确定 dif_acum 将在本地内存中创建吗，因为我想我在某处读到这不会放在本地内存中，这将使将 A 预加载到本地内存中毫无意义）。

关于此代码的其他几点：

您的代码似乎只适用于工作组（您既没有使用 groupid 也没有使用 global id 来查看要处理的项目），为了获得最佳性能，您可能希望使用更多。
可能是个人偏好，但对我来说，使用get_local_size(0)workgroupsize 似乎比使用 Define 更好（因为您可能在主机代码中更改它而没有意识到您应该将 opencl 代码更改为）
您的代码中的障碍都是不必要的，因为没有线程访问由另一个线程写入的本地内存中的元素。因此，您不需要为此使用本地内存。

考虑到最后一个项目符号，您可以简单地执行以下操作：

float As = GetElement(tmpA, 0, featA);
...
float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];

这将使代码（不考虑前两个项目符号）：

__kernel void CompareDescriptors_deb(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE])
{
   int gpidA = get_global_id(0);
   int featA = get_local_id(0);
   int loop = 0;
   for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){
       DescriptorList tmpA = GetDescriptor(A, i);
       float As = GetElement(tmpA, 0, featA);
       for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
           float dif_acum = As-B.elements[k*BLOCK_SIZE + featA];

           accum[featA] = dif_acum[featA]*dif_acum[featA];
           barrier(CLK_LOCAL_MEM_FENCE);
           for(unsigned int i = 1; i < BLOCKSIZE; i *= 2)
           {
              if ((featA % (2*i)) == 0)
                 accum[featA] += accum[featA + i];
              barrier(CLK_LOCAL_MEM_FENCE);
           }
           if(featA == 0)
              C[loop] = accum[0];
           barrier(CLK_LOCAL_MEM_FENCE);

           loop += 1;
        }
    }
}

score 3 · Accepted Answer

感谢 Grizzly，我现在有了一个可以工作的内核。根据灰熊的回答，我需要修改一些东西：

我在例程的开头添加了一个 IF 语句，以丢弃所有不会引用我正在使用的数组中任何有效位置的线程。

if(featA > BLOCK_SIZE){return;}

将第一个描述符复制到本地（共享）内存（ig 到 Bs）时，必须指定索引，因为函数 GetElement 每次调用只返回一个元素（我在我的问题上跳过了那个）。

Bs[featA] = GetElement(tmpA, 0, featA);

然后，SCAN 循环需要稍作调整，因为缓冲区在每次迭代后都会被覆盖，并且无法控制哪个线程首先访问数据。这就是为什么我要“回收” dif_acum 缓冲区来存储部分结果，这样可以防止整个循环中出现不一致。

dif_acum[featA] = accum[featA];

SCAN 循环中还有一些边界控制，以可靠地确定要添加在一起的项。

if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){

最后，我认为在最后一个 IF 语句中包含循环变量增量是有意义的，这样只有一个线程会修改它。

if(featA == 0){
    C[loop] = accum[BLOCK_SIZE-1];
    loop += 1;
}

而已。我仍然想知道如何使用 group_size 来消除 BLOCK_SIZE 定义，以及我是否可以在线程使用方面采取更好的策略。

所以代码最终看起来像这样：

__kernel void CompareDescriptors(__global float *C, DescriptorList A, DescriptorList B, int elements, __local float accum[BLOCK_SIZE], __local float Bs[BLOCK_SIZE])
{

    int gpidA = get_global_id(0);
    int featA = get_local_id(0);

    //global counter to store final differences
    int loop = 0;

    //auxiliary buffer to store temporary data
    local float dif_acum[BLOCK_SIZE];

    //discard the threads that are not going to be used.
    if(featA > BLOCK_SIZE){
        return;
    }

    //loop over all descriptors in A
    for (int i = 0; i < A.num_elements/BLOCK_SIZE; i++){

        //take the gpidA-th descriptor
        DescriptorList tmpA = GetDescriptor(A, i);

        //copy the current descriptor to local memory
        Bs[featA] = GetElement(tmpA, 0, featA);

        //loop over all the descriptors in B
        for (int k = 0; k < B.num_elements/BLOCK_SIZE; k++){
            //take the difference of both current descriptors
            dif_acum[featA] = Bs[featA]-B.elements[k*BLOCK_SIZE + featA];

            //square the values in dif_acum
            accum[featA] = dif_acum[featA]*dif_acum[featA];
            barrier(CLK_LOCAL_MEM_FENCE);

            //copy the values of accum to keep consistency once the scan procedure starts. Mostly important for the first element. Two buffers are necesarry because the scan procedure would override values that are then further read if one buffer is being used instead.
            dif_acum[featA] = accum[featA];

            //Compute the accumulated sum (a.k.a. scan)
            for(int j = 1; j < BLOCK_SIZE; j *= 2){
                int next_addend = featA-(j/2);
                if (featA >= j && next_addend >= 0 && next_addend < BLOCK_SIZE){
                    dif_acum[featA] = accum[featA] + accum[next_addend];
                }
                barrier(CLK_LOCAL_MEM_FENCE);

                //copy As to accum
                accum[featA] = GetElementArray(dif_acum, BLOCK_SIZE, featA); 
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            //tell one of the threads to write the result of the scan in the array containing the results.
            if(featA == 0){
                C[loop] = accum[BLOCK_SIZE-1];
                loop += 1;
            }
            barrier(CLK_LOCAL_MEM_FENCE);

        }
    }
}

cuda - 使用 OpenCL 的累积数组求和

2 回答 2

Related

Reference