我正在尝试基于Cyclic Reduction
我的GTS450
.
本文说明了循环减少
Y. Zhang、J. Cohen、JD Owens,“GPU 上的快速三对角求解器”
但是,无论我做什么,我的 CUDA 代码都比顺序代码慢得多。我的总分结果512 x 512
是7ms
,但是在我的 i7 3.4GHz 上它是5ms
。GPU没有加速!
哪个可能是问题?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
我通过 启动了这个内核cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x)
,并达到了100%
设备占用率。这个结果让我困惑了好几天。
我的代码有一个改进版本:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
0.73ms
在Quadro k4000
for系统上速度提高了512 x 512
,但是上述论文中的代码0.5ms
在GTX280
.