Regarding the latency of the __shfl() instruction: does executing the instruction

c = __shfl(c, indi);
/*
   where indi is an arbitrary integer (possibly random, < 32)
   and differs from lane to lane (different for each laneID).
*/

have the same latency as, say:

c = __shfl_down(c, 1);
To provide a "quantitative" follow-up to Robert's answer, let us consider Mark Harris' reduction approach using CUDA shuffle operations, detailed in Faster Parallel Reductions on Kepler.

In that approach, the warp reduction is performed with __shfl_down. An alternative approach performs the warp reduction with __shfl_xor, following Lecture 4: warp shuffles, and reduce/scan operations. Below I report the full code implementing both approaches (the __shfl_xor variant appears as a commented-out line inside warpReduceSum). When tested on a Kepler K20c, both take 0.044 ms to reduce an array of N = 200000 float elements. Since Thrust's reduce takes 1.06 ms for the same test, both approaches are more than an order of magnitude (roughly 25x) faster than Thrust.
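To make the difference between the two warp-level primitives explicit, here is a minimal sketch of the two loops in isolation (the function names are mine; both assume the pre-CUDA 9 intrinsics used in the code below and a full warp of 32 active lanes):

// Reduction with __shfl_down: each step folds the upper half of the
// surviving lanes onto the lower half (offsets 16, 8, 4, 2, 1).
__device__ float warpReduceSumDown(float val) {
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);   // lane i adds the value held by lane i + offset
    return val;                            // only lane 0 holds the full warp sum
}

// Reduction with __shfl_xor: each step pairs lane i with lane i ^ mask
// (masks 1, 2, 4, 8, 16) in a butterfly pattern.
__device__ float warpReduceSumXor(float val) {
    for (int mask = 1; mask < 32; mask *= 2)
        val += __shfl_xor(val, mask);      // exchange with the butterfly partner lane
    return val;                            // every lane holds the full warp sum
}

The practical difference is only where the result ends up: the __shfl_down version leaves the warp sum in lane 0, while the __shfl_xor butterfly leaves it in all 32 lanes.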
Here is the full code:
#include <cstdio>

#include <thrust/device_vector.h>
#define warpSize 32
/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
__forceinline__ __device__ float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2) val += __shfl_down(val, offset);
//for (int i=1; i<warpSize; i*=2) val += __shfl_xor(val, i);
return val;
}
/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
__forceinline__ __device__ float blockReduceSum(float val) {
// --- The shared memory holds the per-warp reduction results. Since a block has at most 1024 threads,
// there are at most 32 warps per block, so 32 floats suffice.
static __shared__ float shared[32];
int lane = threadIdx.x % warpSize; // Thread index within the warp
int wid = threadIdx.x / warpSize; // Warp ID
// --- Performing the warp reduction. After this call, only lane 0 of each warp holds the warp reduction result in "val"
val = warpReduceSum(val);
// --- Only the threads with 0 index within the warp write the warp result to shared memory
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
// --- Wait for all warp reductions
__syncthreads();
// --- There will be at most 1024 threads within a block and at most 1024 blocks within a grid. The partial sum is read from shared memory only
// if the corresponding warp existed; otherwise the partial sum is set to zero.
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
// --- The first warp performs the final partial warp summation.
if (wid==0) val = warpReduceSum(val);
return val;
}
/********************/
/* REDUCTION KERNEL */
/********************/
__global__ void deviceReduceKernel(float *in, float* out, int N) {
float sum = 0.f;
// --- Reduce multiple elements per thread.
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) sum += in[i];
sum = blockReduceSum(sum);
if (threadIdx.x==0) out[blockIdx.x]=sum;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 200000;
thrust::host_vector<float> h_out(N,0.f);
thrust::device_vector<float> d_in(N,3.f);
thrust::device_vector<float> d_out(N);
int threads = 512;
int blocks = min((N + threads - 1) / threads, 1024);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// --- Performs the block reduction. It returns an output vector containing the block reductions as elements
cudaEventRecord(start, 0);
deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
// --- Performs a second block reduction with only one block. The input is an array of all 0's, except for the first "blocks"
// elements, which hold the block reduction results of the previous step.
deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("CUDA Shuffle - elapsed time: %3.5f ms \n", time);
h_out = d_out;
cudaEventRecord(start, 0);
float sum = thrust::reduce(d_in.begin(),d_in.end(),0.f,thrust::plus<float>());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("CUDA Thrust - elapsed time: %3.5f ms \n", time);
printf("Shuffle result = %f\n",h_out[0]);
printf("Thrust result = %f\n",sum);
getchar();
return 0;
}
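As a side note, on CUDA 9.0 and later the plain __shfl / __shfl_down / __shfl_xor intrinsics used above are deprecated in favor of the *_sync variants. A minimal sketch of the warp-level reduction rewritten with them (assuming all 32 lanes participate, hence the full 0xffffffff mask; the function name is mine):

__forceinline__ __device__ float warpReduceSumSync(float val) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    //for (int i = 1; i < warpSize; i *= 2) val += __shfl_xor_sync(0xffffffff, val, i);
    return val;
}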
All warp-shuffle instructions have the same performance.
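To put a number on the original latency question, one could time the two access patterns directly. The sketch below is only illustrative (the kernel names, the iteration count, and the lane permutation are my own choices, not from the original post); each kernel chains a long dependent sequence of shuffles so that shuffle latency dominates the runtime, and it uses the CUDA 9+ *_sync intrinsics:

#include <cstdio>
#include <cuda_runtime.h>

#define FULL_MASK 0xffffffff

// Chain of dependent shuffles reading from an arbitrary, per-lane source lane.
__global__ void shflArbitraryLane(float *out, const int *srcLane, int iters) {
    float c = (float)threadIdx.x;
    int indi = srcLane[threadIdx.x % 32];                        // lane-dependent source index (< 32)
    for (int i = 0; i < iters; i++) c = __shfl_sync(FULL_MASK, c, indi);
    out[threadIdx.x] = c;                                        // keep the chain alive
}

// Chain of dependent shuffles with a fixed offset of 1.
__global__ void shflDownByOne(float *out, int iters) {
    float c = (float)threadIdx.x;
    for (int i = 0; i < iters; i++) c = __shfl_down_sync(FULL_MASK, c, 1);
    out[threadIdx.x] = c;
}

int main() {
    const int iters = 1 << 20;
    float *d_out;
    int h_lanes[32], *d_lanes;
    for (int i = 0; i < 32; i++) h_lanes[i] = (7 * i + 3) % 32;  // arbitrary lane permutation
    cudaMalloc(&d_out, 32 * sizeof(float));
    cudaMalloc(&d_lanes, 32 * sizeof(int));
    cudaMemcpy(d_lanes, h_lanes, 32 * sizeof(int), cudaMemcpyHostToDevice);

    // Warm-up launches so that neither timed run pays one-time startup costs.
    shflArbitraryLane<<<1, 32>>>(d_out, d_lanes, 1);
    shflDownByOne<<<1, 32>>>(d_out, 1);
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    float ms;
    cudaEventCreate(&start); cudaEventCreate(&stop);

    cudaEventRecord(start);
    shflArbitraryLane<<<1, 32>>>(d_out, d_lanes, iters);
    cudaEventRecord(stop); cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("__shfl_sync, arbitrary lane: %f ms\n", ms);

    cudaEventRecord(start);
    shflDownByOne<<<1, 32>>>(d_out, iters);
    cudaEventRecord(stop); cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("__shfl_down_sync, offset 1 : %f ms\n", ms);

    cudaFree(d_out); cudaFree(d_lanes);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}

Comparing the two printed times on a given GPU directly tests whether a data-dependent source lane costs more per shuffle than a fixed offset of 1.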