我是 Cuda 的新手。我试图以 Ricky 动量的形式求解具有初始条件的波动方程。代码的性能是 12 GFlops,虽然我的 GPU 性能是 3900。为什么代码对我来说如此无效,我该如何解决?
主文件
#include <iostream>
#include <cmath>
#include "step.cu"
#include <cuda.h>
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
if (argc <= 3)
{
perror("Error in argc: argc<=3 (wait h, tau, C) \n");
exit(1);
}
char *eptr;
errno = 0;
long long int size,tmax;
double tau,cour,h,C, cour2;
h = std::strtod(argv[1], &eptr);
tau = std::strtod(argv[2], &eptr);
C = std::strtod(argv[3], &eptr);
tmax = 2000;
cour = C*tau/h;
cour2 = cour* cour;
size = 18*13*1024;
double *nxt_layer=nullptr;
double *layer_1=nullptr;
double *layer_2=nullptr;
double *rev_layer=nullptr;
dim3 blockSize = dim3(1024);
dim3 gridSize = dim3(size/blockSize.x);
float time;
cudaTimer timer;
cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_1, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_2, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
for (int i = 0; i < size; ++i)
{
layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
}
for (int i = 1; i < size/2; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
for (int i = size/2; i < size-1; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.25*0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
for (int i = 0; i < size-1; ++i)
{
layer_2[i] = layer_1[i];
layer_1[i] = nxt_layer[i];
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
timer.start();
for (double t = 0; t < tmax; t=t+tau)
{
step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
if (CHECK_ERROR(cudaDeviceSynchronize()))
throw(-1);
nxt_layer[size-1]=0;
nxt_layer[0]=0;
}
time = timer.stop();
for (int i = 0; i < size; ++i)
{
cout<<i*h<<" "<<nxt_layer[i]<<endl;
}
}
步骤.cu
inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
return __fmaf_rd(cour2, layer_1_tmp[0]+layer_1_tmp[2], __fmaf_rd(2.0-2*cour2,layer_1_tmp[1],-layer_2_tmp));
}
__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
int node = threadIdx.x + blockDim.x * blockIdx.x;
if(node >= Nx-1 || node<=0) return;
double layer_1_tmp[3];
layer_1_tmp[0]=layer_1[node-1];
layer_1_tmp[1]=layer_1[node];
layer_1_tmp[2]=layer_1[node+1];
double layer_2_tmp=layer_2[node];
if(node<=Nx/2)
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
}
else
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
}
layer_2[node]=layer_1[node];
layer_1[node]=tmp_layer[node];
}
我将 GFlops 计算为
long long int perfomance = size*tmax/tau;
long long int perftime = 1000*perfomance/time;
double gflops =(8*perfomance/time)/1000000;
我将不胜感激您的任何意见和提示。