cuda - CUDA：如何运行 Mark Harris 的 NVIDIA 论文中描述的求和并行归约代码？

Question

尽管我了解本文中描述的并行归约背后的逻辑，但对于输入数组为 1 的简单示例，我似乎无法运行它size。

这是我到目前为止所取得的成就。请记住，我正在使用推力库来管理输入和输出数据。

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;


__global__ void reduce0(int *g_idata, int *g_odata){

   extern __shared__ int sdata[];

  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];

  __syncthreads();

  for(unsigned int s=1; s < blockDim.x; s *= 2) {
     if (tid % (2*s) == 0) {
        sdata[tid] += sdata[tid + s];
     }
  __syncthreads();
 }
 if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}


int main(void){

  int size = 10;
  thrust::host_vector<int> data_h_i(size, 1);

  //initialize the data, all values will be 1 
  //so the final sum will be equal to 10

  int threadsPerBlock = 256;
  int totalBlocks = size/threadsPerBlock + 1;

  dim3 dimGrid(totalBlocks,1,1);
  dim3 dimBlock(threadsPerBlock, 1, 1);

  thrust::device_vector<int> data_v_i = data_h_i;
  thrust::device_vector<int> data_v_o(size);

  int* output = thrust::raw_pointer_cast(data_v_o.data());
  int* input = thrust::raw_pointer_cast(data_v_i.data());

  reduce0<<<dimGrid, dimBlock>>>(input, output);

  data_v_i.clear();
  data_v_i.shrink_to_fit();

  thrust::host_vector<int> data_h_o = data_v_o;

  data_v_o.clear();
  data_v_o.shrink_to_fit();

  cout<<data_h_o[0]<<endl;


  return 0;

}

代码很简单，我创建一个host_vector大小size并将所有值初始化为 1。

然后我说我们每个块需要 256 个线程，并动态找到我的示例所需的块数量。

为了简单起见，我只创建了一个包含 10 个值的数组，这意味着我们只需要一个块。因此，一次内核调用就足以产生最终结果。

我的问题如下：

问题 1

编译上述示例 ( nvcc -O3 reduction.cu -arch=sm_21) 并输入后，./a.out我收到以下消息：

terminate called after throwing an instance of 'thrust::system::system_error' what(): unspecified launch failure

我不确定这里发生了什么，但在我看来，错误来自这条线

sdata[tid] = g_idata[i]

内核是论文中描述的内核的精确副本，因此我不确定需要进行哪些更改才能解决此问题。

问题2

如果我们解决第一个问题，我们如何才能使上述代码适用于任意大小的输入数组？例如，如果我们size的块数超过 256，那么我们至少需要两个块，因此每个块将给出一个输出，然后必须将其与其他块的输出组合。在论文中它说我们需要多次调用内核，但是我不确定如何动态完成。

先感谢您

EDIT1：对于问题 1，我似乎没有为共享内存正确分配内存。像这样调用内核：reduce0<<<dimGrid, dimBlock, size*sizeof(int)>>>(input, output);并检查是否tid超出范围。使代码正常工作。新内核如下：

__global__ void reduce0(int *g_idata, int *g_odata, int size){

   extern __shared__ int sdata[];

   unsigned int tid = threadIdx.x;
   unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

   if(tid<size){

     sdata[tid] = g_idata[i];
     __syncthreads();

    for(unsigned int s=1; s < size; s *= 2) {
        if (tid % (2*s) == 0) {
         sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
     }

   if (tid == 0) g_odata[blockIdx.x] = sdata[0];

  }

}

不过，我仍然不确定问题 2。

score 5 · Accepted Answer

问题 1

您的内核正在使用动态分配的共享内存：

extern __shared__ int sdata[];
...
sdata[tid] = g_idata[i];

但是您没有在内核调用中分配任何动态共享内存：

reduce0<<<dimGrid, dimBlock>>>(input, output);
                           ^
                           |
                           missing shared memory parameter.

因此，当您尝试访问共享内存时，您会遇到内核错误。顺便说一句，您仍然可以对内核调用进行cuda 错误检查（即使您在其他地方使用推力）。

问题2

问题 2 在 Mark 的论文中得到了很好的回答。您可以在幻灯片 9 的底部看到每个块将其部分结果写入全局内存 (g_odata[]) 中的一个数组，该数组存储每个块的一个结果。然后我们简单地启动另一个本质上相同类型的内核，它在 g_odata[] 上运行，而不是原始输入数据。我们可以连续执行这个过程，直到我们的部分结果（例如 g_odata[]）只包含 256 个结果，或者我们在一个线程块中启动了多少个线程。然后我们可以将最终结果与单个线程块相加并产生单个答案值。

示例在此处的 cuda 示例代码中给出。

这是您的代码的编辑版本，它显示了如何依次调用两个内核来处理更大的大小。我不认为这是简化编程的典范，只是您为说明该概念而编写的内容的简单扩展。请注意，整个内核和主要代码都进行了各种更改，以方便使用内核处理更大的数据大小。此方法仍然不会扩展到超过 (threadsPerBlock ^2) 的数据大小，但它再次说明了按顺序调用多个内核以对部分结果求和的概念，对代码的修改最少。

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;


__global__ void reduce0(int *g_idata, int *g_odata, int size){

   extern __shared__ int sdata[];

   unsigned int tid = threadIdx.x;
   unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
   sdata[tid] = 0;
   if(i<size)
     sdata[tid] = g_idata[i];
   __syncthreads();

  for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
         sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
     }

   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){

  int size = 40000;
  thrust::host_vector<int> data_h_i(size, 1);

  //initialize the data, all values will be 1
  //so the final sum will be equal to size

  int threadsPerBlock = 256;
  int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;

  thrust::device_vector<int> data_v_i = data_h_i;
  thrust::device_vector<int> data_v_o(totalBlocks);

  int* output = thrust::raw_pointer_cast(data_v_o.data());
  int* input = thrust::raw_pointer_cast(data_v_i.data());
  reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);

  reduce0<<<1, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, totalBlocks);
  data_v_o[0] = data_v_i[0];
  data_v_i.clear();
  data_v_i.shrink_to_fit();

  thrust::host_vector<int> data_h_o = data_v_o;

  data_v_o.clear();
  data_v_o.shrink_to_fit();

  cout<<data_h_o[0]<<endl;


  return 0;

}

score 2 · Accepted Answer

在修改Robert Crovella编写的代码以回答我的问题后，这是支持任意数量输入值的最终版本。

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <ctime>
#include <sys/time.h>
#include <sstream>
#include <string>
#include <fstream>

using namespace std;


__global__ void reduce0(int *g_idata, int *g_odata, int size){

   extern __shared__ int sdata[];

   unsigned int tid = threadIdx.x;
   unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
   sdata[tid] = 0;
   if(i<size)
     sdata[tid] = g_idata[i];
   __syncthreads();

   for(unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
         sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
     }

   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

int main(void){

  int size = 939289;
  thrust::host_vector<int> data_h_i(size, 1);

  //initialize the data, all values will be 1
  //so the final sum will be equal to size

  int threadsPerBlock = 256;
  int totalBlocks = (size+(threadsPerBlock-1))/threadsPerBlock;

  thrust::device_vector<int> data_v_i = data_h_i;
  thrust::device_vector<int> data_v_o(totalBlocks);

  int* output = thrust::raw_pointer_cast(data_v_o.data());
  int* input = thrust::raw_pointer_cast(data_v_i.data());

  bool turn = true;

  while(true){

    if(turn){

      reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(input, output, size);
      turn = false;

    }
    else{

      reduce0<<<totalBlocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(output, input, size);
      turn = true;

    }

    if(totalBlocks == 1) break;

    size = totalBlocks;
    totalBlocks = ceil((double)totalBlocks/threadsPerBlock);

  }

  thrust::host_vector<int> data_h_o;

  if(turn)
    data_h_o = data_v_i;
  else
    data_h_o = data_v_o;

  data_v_i.clear();
  data_v_i.shrink_to_fit();

  data_v_o.clear();
  data_v_o.shrink_to_fit();

  cout<<data_h_o[0]<<endl;


  return 0;

}

cuda - CUDA：如何运行 Mark Harris 的 NVIDIA 论文中描述的求和并行归约代码？

2 回答 2

Related

Reference