I am implementing a simple geometric Brownian motion on the GPU. The code works, in the sense that it produces the correct values. What worries me is the speedup I am getting; I was expecting more. So far I have two implementations: one that only touches global memory, which is about 3x faster, and a second one that uses shared memory, which is only about 2.3x faster.
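For context, this is the per-path recurrence both kernels implement (my reading of the code: one thread per path p, rv assumed to hold pre-generated standard normal draws, c1 acting as the volatility factor and c2 as the drift factor):

pb[i+1][p] = pb[i][p] * exp(c2 * dt[i] + c1 * sqrt(dt[i]) * rv[i][p]),  with dt[i] = timeNodes[i]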
My question came up after profiling the application with the NVIDIA Visual Profiler. According to it, my load/store efficiency is 100%, but DRAM utilization is very low (around 10%) and almost 50% of the global memory accesses are replays caused by non-coalesced accesses.
After seeing that, I tried to use shared memory to avoid the global memory accesses, but to my surprise DRAM utilization went even lower (4.5%) and global memory replay only dropped to 46.3%.
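A rough way to cross-check the profiler's DRAM utilization figure is to time the kernel with events and divide the bytes it has to move by the elapsed time. This is only a sketch, meant to be dropped in around the launch in computePathGpu; the 24 bytes per path per time step is my back-of-the-envelope count for kernelGlobalMem (read pb[i], read rv[i], write pb[i+1]):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
kernelGlobalMem<<<grid, block>>>(d_rv, d_pb);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
//3 doubles of traffic per path per time step (rough estimate, ignores caching)
double bytes = 3.0 * sizeof(double) * (double)numTimeSteps * (double)numPaths;
printf("%.3f ms, ~%.1f GB/s effective\n", ms, bytes / (ms * 1.0e6)); //needs <cstdio>
cudaEventDestroy(start);
cudaEventDestroy(stop);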
I have also noticed that occupancy is low for this kernel launch, because I use almost all of the shared memory available per block, but I don't know whether that explains why the second approach performs worse.
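To put a number on how much the shared memory usage limits occupancy, the runtime occupancy API (available since CUDA 6.5) can report how many blocks of kernelSharedMem fit on an SM. A minimal sketch:

int maxBlocksPerSM = 0;
//Blocks of kernelSharedMem that can be resident per SM at BLOCK_SIZE threads,
//given its static shared memory ((7 + 8) * 64 doubles = 7680 bytes per block)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocksPerSM,
    kernelSharedMem, BLOCK_SIZE, 0 /*dynamic shared memory*/);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("kernelSharedMem: %d blocks/SM -> %.0f%% occupancy\n", maxBlocksPerSM,
    100.0 * maxBlocksPerSM * BLOCK_SIZE / prop.maxThreadsPerMultiProcessor);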
Could you give me some advice on what might be going on performance-wise, and where I should look or what I could try in order to improve it?
CUDA_IMPLEMENTATION.CU
#include <vector>
using std::vector;

#define BLOCK_SIZE 64
#define SHMEM_ROWS 7 //Must equal numTimeSteps (= c_numTimeSteps)
#define SHMEM_COLS BLOCK_SIZE
__constant__ double c_c1;
__constant__ double c_c2;
__constant__ int c_numTimeSteps;
__constant__ int c_numPaths;
__constant__ double c_timeNodes[2000];
__global__
void kernelSharedMem(double *rv, double *pb)
{
__shared__ double sh_rv[SHMEM_ROWS*SHMEM_COLS];
__shared__ double sh_pb[(SHMEM_ROWS+1)*SHMEM_COLS];
int p = blockDim.x * blockIdx.x + threadIdx.x;
//The idea of this outer loop is to process the time steps in tiles of SHMEM_ROWS rows
for(int tb = 0; tb < c_numTimeSteps; tb += SHMEM_ROWS)
{
//Copy values into shared memory
for(int is = tb, isSh = 0;
is < tb+SHMEM_ROWS && is < c_numTimeSteps;
is++, isSh++)
{
sh_rv[isSh*SHMEM_COLS+threadIdx.x] =
rv[is*c_numPaths+p];
}
sh_pb[threadIdx.x] = pb[tb*c_numPaths+p]; //Row tb of pb is the starting value for this tile
__syncthreads();
//Main computation in SHARED MEMORY
for(int isSh = 0; isSh < SHMEM_ROWS && tb+isSh < c_numTimeSteps; isSh++)
{
double dt = c_timeNodes[tb+isSh]; //Index with the absolute time step, not the tile-local one
double sdt = sqrt(dt) * c_c1;
double mdt = c_c2 * dt;
sh_pb[(isSh+1)*SHMEM_COLS+threadIdx.x] =
sh_pb[isSh*SHMEM_COLS+threadIdx.x] *
exp(mdt + sdt * sh_rv[isSh*SHMEM_COLS+threadIdx.x]); //Read rv from the shared-memory tile
}
__syncthreads();
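//Write this tile's results back to global memory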
for(int is = tb, isSh = 0;
is < tb+SHMEM_ROWS && is < c_numTimeSteps;
is++, isSh++)
{
pb[(is+1)*c_numPaths+p] =
sh_pb[(isSh+1)*SHMEM_COLS+threadIdx.x];
}
}
}
__global__
void kernelGlobalMem(double *rv, double *pb)
{
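//One thread per path: march through all time steps straight from global memory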
int p = blockDim.x * blockIdx.x + threadIdx.x;
for(int i = 0; i < c_numTimeSteps; i++)
{
double dt = c_timeNodes[i];
double sdt = sqrt(dt) * c_c1;
double mdt = c_c2 * dt;
pb[(i+1)*c_numPaths+p] =
pb[i*c_numPaths+p] *
exp(mdt + sdt * rv[i*c_numPaths+p]);
}
}
extern "C" void computePathGpu(vector<vector<double>>* rv,
vector<vector<double>>* pb,
int numTimeSteps, int numPaths,
vector<double> timeNodes,
double c1, double c2)
{
cudaMemcpyToSymbol(c_c1, &c1, sizeof(double));
cudaMemcpyToSymbol(c_c2, &c2, sizeof(double));
cudaMemcpyToSymbol(c_numTimeSteps, &numTimeSteps, sizeof(int));
cudaMemcpyToSymbol(c_numPaths, &numPaths, sizeof(int));
cudaMemcpyToSymbol(c_timeNodes, &(timeNodes[0]), sizeof(double)*numTimeSteps);
double *d_rv;
double *d_pb;
cudaMalloc((void**)&d_rv, sizeof(double)*numTimeSteps*numPaths);
cudaMalloc((void**)&d_pb, sizeof(double)*(numTimeSteps+1)*numPaths);
vector<vector<double>>::iterator itRV;
vector<vector<double>>::iterator itPB;
//Upload rv row by row: each inner vector is a separate host allocation
double *dst = d_rv;
for(itRV = rv->begin(); itRV != rv->end(); ++itRV)
{
double *src = &((*itRV)[0]);
size_t s = itRV->size();
cudaMemcpy(dst, src, sizeof(double)*s, cudaMemcpyHostToDevice);
dst += s;
}
//Row 0 of d_pb holds the initial values, taken from the first row of pb
cudaMemcpy(d_pb, &((*(pb->begin()))[0]),
sizeof(double)*(pb->begin())->size(), cudaMemcpyHostToDevice);
dim3 block(BLOCK_SIZE);
dim3 grid((numPaths+BLOCK_SIZE-1)/BLOCK_SIZE);
kernelGlobalMem<<<grid, block>>>(d_rv, d_pb);
//kernelSharedMem<<<grid, block>>>(d_rv, d_pb);
cudaDeviceSynchronize();
//Rows 1..numTimeSteps of d_pb hold the computed steps; row 0 never changed, so start at pb[1]
dst = d_pb;
for(itPB = ++(pb->begin()); itPB != pb->end(); ++itPB)
{
double *src = &((*itPB)[0]);
size_t s = itPB->size();
dst += s;
cudaMemcpy(src, dst, sizeof(double)*s, cudaMemcpyDeviceToHost);
}
cudaFree(d_pb);
cudaFree(d_rv);
}
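For completeness: none of the runtime calls above are checked, so a failing copy or launch would look like a very fast run. A sketch of a check macro (CUDA_CHECK is just an illustrative name):

#include <cstdio>
#include <cstdlib>
//Abort with a message if a runtime call fails, so a failed copy or launch
//cannot be mistaken for a fast one
#define CUDA_CHECK(call) \
do { \
    cudaError_t err = (call); \
    if(err != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
//Usage:
//CUDA_CHECK(cudaMalloc((void**)&d_rv, sizeof(double)*numTimeSteps*numPaths));
//kernelGlobalMem<<<grid, block>>>(d_rv, d_pb);
//CUDA_CHECK(cudaGetLastError());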
MAIN PROGRAM
extern "C" void computeOnGPU(vector<vector<double>>* rv,
vector<vector<double>>* pb,
int numTimeSteps, int numPaths,
vector<double> timeNodes,
double c1, double c2);
int main(){
int numTimeSteps = 7;
int numPaths = 2000000;
vector<vector<double>> rv(numTimeSteps, vector<double>(numPaths));
//Fill rv
vector<double> timeNodes(numTimeSteps);
//Fill timeNodes
vector<vector<double>> pb(numTimeSteps+1, vector<double>(numPaths, 0)); //Row 0: initial values; rows 1..numTimeSteps: results
//Fill pb[0] with the initial values
computePathGpu(&rv, &pb, numTimeSteps, numPaths, timeNodes, 0.2, 0.123);
}