我想用 CUDA 语言编写一个电磁二维有限差分时域 (FDTD) 代码。更新磁场的C代码如下
// --- Update for Hy and Hx
for(int i=n1; i<=n2; i++)
for(int j=n11; j<=n21; j++){
Hy[i*ydim+j]=A[i*ydim+j]*Hy[i*ydim+j]+B[i*ydim+j]*(Ezx[(i+1)*ydim+j]-Ezx[i*ydim+j]+Ezy[(i+1)*ydim+j]-Ezy[i*ydim+j]);
Hx[i*ydim+j]=G[i*ydim+j]*Hx[i*ydim+j]-H[i*ydim+j]*(Ezx[i*ydim+j+1]-Ezx[i*ydim+j]+Ezy[i*ydim+j+1]-Ezy[i*ydim+j]);
}
}
我的第一次并行化尝试是以下内核:
__global__ void H_update_kernel(double* Hx_h, double* Hy_h, double* Ezx_h, double* Ezy_h, double* A_h, double* B_h,double* G_h, double* H_h, int n1, int n2, int n11, int n21)
{
int idx = blockIdx.x*BLOCK_SIZE_X + threadIdx.x;
int idy = blockIdx.y*BLOCK_SIZE_Y + threadIdx.y;
if ((idx <= n2 && idx >= n1)&&(idy <= n21 && idy >= n11)) {
Hy_h[idx*ydim+idy]=A_h[idx*ydim+idy]*Hy_h[idx*ydim+idy]+B_h[idx*ydim+idy]*(Ezx_h[(idx+1)*ydim+idy]-Ezx_h[idx*ydim+idy]+Ezy_h[(idx+1)*ydim+idy]-Ezy_h[idx*ydim+idy]);
Hx_h[idx*ydim+idy]=G_h[idx*ydim+idy]*Hx_h[idx*ydim+idy]-H_h[idx*ydim+idy]*(Ezx_h[idx*ydim+idy+1]-Ezx_h[idx*ydim+idy]+Ezy_h[idx*ydim+idy+1]-Ezy_h[idx*ydim+idy]); }
}
但是,通过也使用 Visual Profiler,我对这个解决方案不满意,原因有两个:1)内存访问的合并很差;2) 未使用共享内存。
然后我决定使用以下解决方案
__global__ void H_update_kernel(double* Hx_h, double* Hy_h, double* Ezx_h, double* Ezy_h, double* A_h, double* B_h,double* G_h, double* H_h, int n1, int n2, int n11, int n21)
{
int i = threadIdx.x;
int j = threadIdx.y;
int idx = blockIdx.x*BLOCK_SIZE_X + threadIdx.x;
int idy = blockIdx.y*BLOCK_SIZE_Y + threadIdx.y;
int index1 = j*BLOCK_SIZE_Y+i;
int i1 = (index1)%(BLOCK_SIZE_X+1);
int j1 = (index1)/(BLOCK_SIZE_Y+1);
int i2 = (BLOCK_SIZE_X*BLOCK_SIZE_Y+index1)%(BLOCK_SIZE_X+1);
int j2 = (BLOCK_SIZE_X*BLOCK_SIZE_Y+index1)/(BLOCK_SIZE_Y+1);
__shared__ double Ezx_h_shared[BLOCK_SIZE_X+1][BLOCK_SIZE_Y+1];
__shared__ double Ezy_h_shared[BLOCK_SIZE_X+1][BLOCK_SIZE_Y+1];
if (((blockIdx.x*BLOCK_SIZE_X+i1)<xdim)&&((blockIdx.y*BLOCK_SIZE_Y+j1)<ydim))
Ezx_h_shared[i1][j1]=Ezx_h[(blockIdx.x*BLOCK_SIZE_X+i1)*ydim+(blockIdx.y*BLOCK_SIZE_Y+j1)];
if (((i2<(BLOCK_SIZE_X+1))&&(j2<(BLOCK_SIZE_Y+1)))&&(((blockIdx.x*BLOCK_SIZE_X+i2)<xdim)&&((blockIdx.y*BLOCK_SIZE_Y+j2)<ydim)))
Ezx_h_shared[i2][j2]=Ezx_h[(blockIdx.x*BLOCK_SIZE_X+i2)*xdim+(blockIdx.y*BLOCK_SIZE_Y+j2)];
__syncthreads();
if ((idx <= n2 && idx >= n1)&&(idy <= n21 && idy >= n11)) {
Hy_h[idx*ydim+idy]=A_h[idx*ydim+idy]*Hy_h[idx*ydim+idy]+B_h[idx*ydim+idy]*(Ezx_h_shared[i+1][j]-Ezx_h_shared[i][j]+Ezy_h[(idx+1)*ydim+idy]-Ezy_h[idx*ydim+idy]);
Hx_h[idx*ydim+idy]=G_h[idx*ydim+idy]*Hx_h[idx*ydim+idy]-H_h[idx*ydim+idy]*(Ezx_h_shared[i][j+1]-Ezx_h_shared[i][j]+Ezy_h[idx*ydim+idy+1]-Ezy_h[idx*ydim+idy]); }
}
需要索引技巧来使 BS_x * BS_y 线程块将 (BS_x+1)*(BS_y+1) 全局内存位置读取到共享内存。我相信这个选择比前一个更好,由于使用了共享内存,虽然并不是所有的访问都真正合并,见
我的问题是,如果你们中的任何人都可以在合并内存访问方面向我提出更好的解决方案。谢谢你。