I am testing the effect of inserting atomic add operations into an optimized array-reduction kernel in order to measure the performance impact, and I cannot make sense of the results. I tested five different kernels:
0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu
1 - optimized reduction kernel as described in samples/6_Advanced/docs/reduction.pdf
2 - kernel 1 with atomic warp-synchronous reduction
3 - kernel 2 with completely atomic reduction within all shared memory
4 - kernel 3 with completely atomic reduction
Average reduction times on the device I am using, over a sufficiently large sample of elements (see the timing sketch below):
0 - 0.00103s
1 - 0.00103s
2 - 0.00103s
3 - 0.00103s
4 - 0.00117s
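For context, each number is an average over repeated launches. A minimal sketch of the kind of event-based timing I mean; the repeat count, block count, and the `reduce1<256>` instantiation are placeholders here, not my exact harness:

```cpp
// Sketch only: average the runtime of repeated kernel launches with CUDA events.
// NUM_REPS, the block count, and the <256> instantiation are placeholders.
float timeReduce1(int *d_idata, int *d_odata, unsigned int n, int blocks) {
    const int NUM_REPS = 100;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int rep = 0; rep < NUM_REPS; ++rep)
        reduce1<256><<<blocks, 256, 256 * sizeof(int)>>>(d_idata, d_odata, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return (ms / 1000.0f) / NUM_REPS;   // average seconds per launch
}
```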
Why do the atomic operations appear to have no effect at all on kernels 2 and 3, and only a small effect on kernel 4?
Here is the complete code; the relevant kernels are:
/////////////////
// warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
* to prevent instruction reordering for non-atomic
* operations */
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2)  sdata[tid] += sdata[tid + 1];
}
////////////////////////
// atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
* to serialize computation */
template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
    if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
    if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
    if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
    if (blockSize >= 8)  atomicAdd(&sdata[tid], sdata[tid + 4]);
    if (blockSize >= 4)  atomicAdd(&sdata[tid], sdata[tid + 2]);
    if (blockSize >= 2)  atomicAdd(&sdata[tid], sdata[tid + 1]);
}
////////////////////////
// reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
* cuda/samples/6_Advanced/reduction/reduction_kernel.cu */
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    int sum = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sum += g_idata[i];
        // check bounds
        if (nIsPow2 || i + blockSize < n)
            sum += g_idata[i + blockSize];
        i += gridSize;
    }
    // local sum -> shared memory
    sdata[tid] = sum;
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] = sum = sum + sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] = sum = sum + sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] = sum = sum + sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) {
        // warp-synchronous reduction
        // volatile memory stores won't be reordered by compiler
        volatile int *smem = sdata;
        if (blockSize >= 64)
            smem[tid] = sum = sum + smem[tid + 32];
        if (blockSize >= 32)
            smem[tid] = sum = sum + smem[tid + 16];
        if (blockSize >= 16)
            smem[tid] = sum = sum + smem[tid + 8];
        if (blockSize >= 8)
            smem[tid] = sum = sum + smem[tid + 4];
        if (blockSize >= 4)
            smem[tid] = sum = sum + smem[tid + 2];
        if (blockSize >= 2)
            smem[tid] = sum = sum + smem[tid + 1];
    }
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 1 //
/////////////////////////
/* fastest reduction algorithm described in
* cuda/samples/6_Advanced/reduction/doc/reduction.pdf */
template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i] + g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 2 //
/////////////////////////
/* reduction kernel 1 executed
* with atomic warp-synchronous addition */
template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i] + g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 3 //
/////////////////////////
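/* reduction kernel 2 executed
 * with completely atomic reduction
 * within all shared memory */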
template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i] + g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 4 //
/////////////////////////
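/* reduction kernel 3 executed
 * with completely atomic reduction,
 * including the first (global -> shared) pass */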
template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i + blockSize]));
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
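For completeness, a minimal sketch of the host-side dispatch I have in mind; the `runReduce1` wrapper, the thread and block counts, and the final host-side pass are placeholders, not my actual driver code:

```cpp
// Sketch only: dispatch a templated reduction kernel and note where the
// per-block partial sums end up. Thread/block counts are placeholders.
void runReduce1(int *d_idata, int *d_odata, unsigned int n, int threads, int blocks) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    size_t smemSize = threads * sizeof(int);

    switch (threads) {   // blockSize is a compile-time template parameter
        case 512: reduce1<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
        case 256: reduce1<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
        case 128: reduce1<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
        case  64: reduce1< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    }
    // d_odata now holds one partial sum per block; the final total is obtained
    // by copying those 'blocks' values back and summing them on the host
}
```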