Can CUDA compute the value of such a function fast enough?
Many cost functionals are expressed in the form of a sum of a certain number of terms. Examples are the:
Sphere function
Rosenbrock function
Styblinski-Tang function
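For reference, these have the standard forms (with n the dimension of x):
$f_{\mathrm{Sphere}}(\mathbf{x}) = \sum_{i=1}^{n} x_i^2$
$f_{\mathrm{Rosenbrock}}(\mathbf{x}) = \sum_{i=1}^{n-1} \left[ 100\,(x_{i+1} - x_i^2)^2 + (x_i - 1)^2 \right]$
$f_{\mathrm{Styblinski\text{-}Tang}}(\mathbf{x}) = \tfrac{1}{2} \sum_{i=1}^{n} \left( x_i^4 - 16\,x_i^2 + 5\,x_i \right)$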
In all those cases, the evaluation of the cost function can be performed by a reduction, or better, by a transformation followed by a reduction.
CUDA Thrust has thrust::transform_reduce, which can surely serve the scope, but of course you can set up your own transformation + reduction routines.
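As a minimal, self-contained sketch of the Thrust route, here is the Sphere function (chosen because each term depends on a single element; the functor name square is my own illustration, not part of the code further below):

#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

// --- Unary transformation: squares one element; summing the squares gives the Sphere function
struct square { __host__ __device__ float operator()(const float x) const { return x * x; } };

int main() {
    thrust::device_vector<float> d_x(1 << 20, 3.f);
    // --- Applies 'square' to each element and sums the results on the device
    float f = thrust::transform_reduce(d_x.begin(), d_x.end(), square(), 0.f, thrust::plus<float>());
    printf("Sphere function value = %f\n", f);
    return 0;
}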
Below, I'm providing an example of how the Rosenbrock function can be computed either using CUDA Thrust or a customized version of the reduction routine offered by the CUDA samples. In the latter case, a pointer to the __device__ transformation function is passed to the customized transform_reduce function if the EXTERNAL keyword is defined, or the transformation function is defined and compiled in the compilation unit of the customized routine.
Some performance results on a Kepler K20c card for the non-EXTERNAL case:
N = 90000 Thrust = 0.055ms Customized = 0.059ms
N = 900000 Thrust = 0.67ms Customized = 0.14ms
N = 9000000 Thrust = 0.85ms Customized = 0.87ms
Here is the code. For the timing functions, please refer to the OrangeOwlSolutions/Timing and OrangeOwlSolutions/CUDA_Utilities github projects.
Please note that separate compilation is required.
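A possible build line for the two translation units below (the output name test is arbitrary, and you will also need the sources from the Timing and CUDA_Utilities projects mentioned above; -rdc=true enables relocatable device code, i.e., separate compilation):

nvcc -rdc=true kernel.cu transform_reduce.cu -o test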
kernel.cu
// --- Requires separate compilation
#include <stdio.h>
#include <thrust/device_vector.h>
#include "transform_reduce.cuh"
#include "Utilities.cuh"
#include "TimingCPU.h"
#include "TimingGPU.cuh"
/***************************************/
/* COST FUNCTION - GPU/CUSTOMIZED CASE */
/***************************************/
// --- Transformation function
__device__ __forceinline__ float transformation(const float * __restrict__ x, const int i) { return (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)) ; }
// --- Device-side function pointer
__device__ pointFunction_t dev_pfunc = transformation;
/***********************************/
/* COST FUNCTION - GPU/THRUST CASE */
/***********************************/
// --- Functor evaluating one Rosenbrock term on a tuple (x[i], x[i+1]) delivered by a zip iterator
struct CostFunctionStructGPU{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a) {
float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a)); // --- x[i+1] - x[i]^2
float temp2 = (thrust::get<0>(a) - 1.f); // --- x[i] - 1
return 100.f * temp1 * temp1 + temp2 * temp2;
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 90000000;
float *x = (float *)malloc(N * sizeof(float));
for (int i=0; i<N; i++) x[i] = 3.f;
float *d_x; gpuErrchk(cudaMalloc((void**)&d_x, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice));
/************************************************/
/* "CUSTOMIZED" DEVICE-SIDE TRANSFORM REDUCTION */
/************************************************/
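// --- First call is a warm-up; its result is overwritten by the timed call below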
float customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);
TimingGPU timerGPU;
timerGPU.StartCounter();
customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);
printf("Timing for 'customized', device-side transform reduction = %f\n", timerGPU.GetCounter());
printf("Result for 'customized', device-side transform reduction = %f\n", customizedDeviceResult);
printf("\n\n");
/************************************************/
/* THRUST-BASED DEVICE-SIDE TRANSFORM REDUCTION */
/************************************************/
thrust::device_vector<float> d_vec(N,3.f);
timerGPU.StartCounter();
float ThrustResult = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin(), d_vec.begin() + 1)), thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin() + N - 1, d_vec.begin() + N)), CostFunctionStructGPU(), 0.f, thrust::plus<float>());
printf("Timing for Thrust-based, device-side transform reduction = %f\n", timerGPU.GetCounter());
printf("Result for Thrust-based, device-side transform reduction = %f\n", ThrustResult);
printf("\n\n");
/*********************************/
/* HOST-SIDE TRANSFORM REDUCTION */
/*********************************/
// thrust::host_vector<float> h_vec(d_vec);
//sum_host = sum_host + transformation(thrust::raw_pointer_cast(h_vec.data()), i);
TimingCPU timerCPU;
timerCPU.StartCounter();
float sum_host = 0.f;
for (int i=0; i<N-1; i++) {
float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f));
sum_host = sum_host + temp;
//printf("%i %f %f\n", i, temp, sum_host);
}
printf("Timing for host-side transform reduction = %f\n", timerCPU.GetCounter());
printf("Result for host-side transform reduction = %f\n", sum_host);
printf("\n\n");
sum_host = 0.f;
float c = 0.f;
for (int i=0; i<N-1; i++) {
float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)) - c;
float t = sum_host + temp;
c = (t - sum_host) - temp;
sum_host = t;
}
printf("Result for host-side transform reduction = %f\n", sum_host);
// cudaDeviceReset();
}
transform_reduce.cuh
#ifndef TRANSFORM_REDUCE_CUH
#define TRANSFORM_REDUCE_CUH
// --- Function pointer type
// --- Complete with your own favourite instantiations
typedef float(*pointFunction_t)(const float * __restrict__, const int);
template <class T> T transform_reduce(T *, unsigned int, pointFunction_t *);
#endif
transform_reduce.cu
#include <stdio.h>
#include "Utilities.cuh"
#include "transform_reduce.cuh"
#define BLOCKSIZE 512
#define warpSize 32
// --- Host-side function pointer
pointFunction_t h_pfunc;
// --- Uncomment if you want to apply CUDA error checking to the kernel launches
//#define DEBUG
//#define EXTERNAL
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
// --- Note: although x = 1 is a power of 2 (1 = 2^0), this routine returns 0 for x == 1.
bool isPow2(unsigned int x) {
if (x == 1) return 0;
else return ((x&(x-1))==0);
}
/***************************/
/* TRANSFORMATION FUNCTION */
/***************************/
template <class T>
__host__ __device__ __forceinline__ T transformation(const T * __restrict__ x, const int i) { return ((T)100 * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - (T)1) * (x[i] - (T)1)) ; }
/********************/
/* REDUCTION KERNEL */
/********************/
/*
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reductionKernel(T *g_idata, T *g_odata, unsigned int N, pointFunction_t pPointTransformation)
{
extern __shared__ T sdata[];
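// --- Note: extern __shared__ with a template-dependent type works here only because a single type (float)
// is instantiated; instantiating several types would require a SharedMemory<T> wrapper as in the CUDA samples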
unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - Fictitiously double the block dimension
unsigned int gridSize = blockSize*2*gridDim.x;
// --- Performs the first level of reduction in registers when reading from global memory on multiple elements per thread.
// More blocks will result in a larger gridSize and therefore fewer elements per thread
T mySum = 0;
while (i < N) {
#ifdef EXTERNAL
mySum += (*pPointTransformation)(g_idata, i);
#else
mySum += transformation(g_idata, i);
#endif
// --- Ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < N)
#ifdef EXTERNAL
mySum += (*pPointTransformation)(g_idata, i+blockSize);
#else
mySum += transformation(g_idata, i+blockSize);
#endif
i += gridSize;
}
// --- Each thread puts its local sum into shared memory
sdata[tid] = mySum;
__syncthreads();
// --- Reduction in shared memory. Fully unrolled loop.
if ((blockSize >= 512) && (tid < 256)) sdata[tid] = mySum = mySum + sdata[tid + 256];
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) sdata[tid] = mySum = mySum + sdata[tid + 128];
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) sdata[tid] = mySum = mySum + sdata[tid + 64];
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
// --- Single warp reduction by shuffle operations
if ( tid < 32 )
{
// --- Last iteration removed from the for loop, but needed for shuffle reduction
mySum += sdata[tid + 32];
// --- Reduce final warp using shuffle
for (int offset = warpSize/2; offset > 0; offset /= 2) mySum += __shfl_down(mySum, offset);
// --- On CUDA 9.0 and later, __shfl_down is deprecated in favor of __shfl_down_sync(0xffffffff, mySum, offset)
//for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor(mySum, offset);
}
#else
// --- Reduction within a single warp. Fully unrolled loop.
if ((blockSize >= 64) && (tid < 32)) sdata[tid] = mySum = mySum + sdata[tid + 32];
__syncthreads();
if ((blockSize >= 32) && (tid < 16)) sdata[tid] = mySum = mySum + sdata[tid + 16];
__syncthreads();
if ((blockSize >= 16) && (tid < 8)) sdata[tid] = mySum = mySum + sdata[tid + 8];
__syncthreads();
if ((blockSize >= 8) && (tid < 4)) sdata[tid] = mySum = mySum + sdata[tid + 4];
__syncthreads();
if ((blockSize >= 4) && (tid < 2)) sdata[tid] = mySum = mySum + sdata[tid + 2];
__syncthreads();
if ((blockSize >= 2) && ( tid < 1)) sdata[tid] = mySum = mySum + sdata[tid + 1];
__syncthreads();
#endif
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
/******************************/
/* REDUCTION WRAPPER FUNCTION */
/******************************/
template <class T>
T transform_reduce_inner(T *g_idata, unsigned int N, pointFunction_t h_pfunc) {
// --- Reduction parameters
const int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
const int NumBlocks = (N + NumThreads - 1) / NumThreads;
const int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(T) : NumThreads * sizeof(T);
// --- Device memory space where storing the partial reduction results
T *g_odata; gpuErrchk(cudaMalloc((void**)&g_odata, NumBlocks * sizeof(T)));
if (isPow2(N)) {
switch (NumThreads) {
case 512: reductionKernel<T, 512, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 256: reductionKernel<T, 256, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 128: reductionKernel<T, 128, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 64: reductionKernel<T, 64, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 32: reductionKernel<T, 32, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 16: reductionKernel<T, 16, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 8: reductionKernel<T, 8, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 4: reductionKernel<T, 4, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 2: reductionKernel<T, 2, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 1: reductionKernel<T, 1, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
}
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
else {
switch (NumThreads) {
case 512: reductionKernel<T, 512, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 256: reductionKernel<T, 256, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 128: reductionKernel<T, 128, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 64: reductionKernel<T, 64, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 32: reductionKernel<T, 32, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 16: reductionKernel<T, 16, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 8: reductionKernel<T, 8, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 4: reductionKernel<T, 4, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 2: reductionKernel<T, 2, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 1: reductionKernel<T, 1, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
}
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
T *host_vector = (T *)malloc(NumBlocks * sizeof(T));
gpuErrchk(cudaMemcpy(host_vector, g_odata, NumBlocks * sizeof(T), cudaMemcpyDeviceToHost));
T sum_transformReduce = (T)0;
for (int i=0; i<NumBlocks; i++) sum_transformReduce = sum_transformReduce + host_vector[i];
return sum_transformReduce;
}
template <class T>
T transform_reduce(T *g_idata, unsigned int N, pointFunction_t *dev_pfunc) {
#ifdef EXTERNAL
gpuErrchk(cudaMemcpyFromSymbol(&h_pfunc, *dev_pfunc, sizeof(pointFunction_t)));
#endif
T customizedDeviceResult = transform_reduce_inner(g_idata, N, h_pfunc);
return customizedDeviceResult;
}
// --- Complete with your own favourite instantiations
template float transform_reduce(float *, unsigned int, pointFunction_t *);