给定一个函数 u(x,y) 我想计算一个函数的拉普拉斯算子
我通过 cuFFT 的正向和逆变换来做到这一点。下面是代码
#include <cufft.h>
#include <stdio.h>
#include <math.h>
#define BSZ 8
#define gpuErrChk(ans) { checkCUDAError((ans), __FILE__, __LINE__); }
inline void checkCUDAError(cudaError_t code, const char *file, int line, bool abort=true)
{
//cudaError_t err = cudaGetLastError();
if( cudaSuccess != code)
{
fprintf(stderr, "CUDA Error in %s(%d): %s.\n", file, line, cudaGetErrorString(code) );
exit(EXIT_FAILURE);
}
}
__global__ void scale_drv(cufftComplex *ft, float *k, int N)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
int j = threadIdx.y + blockIdx.y*blockDim.y;
int idx = j*(N/2+1)+i;
if (i<(N/2+1) && j<N)
{
float k2 = float(k[i]*k[i] + k[j]*k[j]);
ft[idx].x *= -k2;
ft[idx].y *= -k2;
}
}
int main(int argc, char **argv)
{
// Default problem size
int N = 64;
// User can change problem size
for (int i=0; i<argc; i++) {
if (strcmp("-N",argv[i]) == 0) {
N = atoi(argv[i+1]);
printf(" found -N %d\n", N);
}
}
// Allocate everything else
float xmax = 1.0f, xmin = 0.0f, ymin = 0.0f;
float h = (xmax-xmin)/((float)N), s = 0.1, s2 = s*s;
float *x, *y, *f, *f_a, *u_a, *k;
float r2;
size_t memSize = N*N*sizeof(float);
size_t vecSize = N*sizeof(float);
x = (float *)malloc(memSize);
y = (float *)malloc(memSize);
f = (float *)malloc(memSize);
f_a = (float *)malloc(memSize);
u_a = (float *)malloc(memSize);
k = (float *)malloc(vecSize);
// Set coordinates, RHS vector, and analytical solution
for (int j = 0; j < N; j++)
{
for (int i = 0; i < N; i++)
{
x[N*j+i] = xmin + i*h;
y[N*j+i] = ymin + j*h;
r2 = (x[N*j+i] - 0.5)*(x[N*j+i]-0.5) + (y[N*j+i] - 0.5)*(y[N*j+i]-0.5);
f_a[N*j+i] = (r2-2*s2)/(s2*s2)*exp(-r2/(2*s2));
u_a[N*j+i] = exp(-r2/(2*s2));
//u_a[N*j+i] = sin(2*M_PI*i/float(N))*sin(2*M_PI*j/float(N));
//f[N*j+i] = -8*M_PI*M_PI*u_a[N*j+i];
}
}
// Fourier coeff?
float freq = 1.f/N;
for (int i = 0; i <= N/2; i++) k[i] = i*2*M_PI*freq;
for (int i = N/2+1; i<N; i++) k[i] = (i - N)*2*M_PI*freq;
// Allocate device information
float *k_d, *u_d;
gpuErrChk(cudaMalloc((void**)&k_d, vecSize));
gpuErrChk(cudaMalloc((void**)&u_d, memSize));
gpuErrChk(cudaMemcpy(u_d, u_a, memSize, cudaMemcpyHostToDevice));
gpuErrChk(cudaMemcpy(k_d, k, vecSize, cudaMemcpyHostToDevice));
// Initiate cuFFT
cufftComplex *u_dk;
gpuErrChk(cudaMalloc((void**)&u_dk, sizeof(cufftComplex)*N*(N/2+1)));
// Create cuFFT plans
cufftHandle planf, planb;
size_t wrksize;
int dims[2] = {N,N};
cufftCreate(&planf);
cufftCreate(&planb);
cufftMakePlanMany(planf,2,dims,NULL,1,0,NULL,1,0,CUFFT_R2C,1,&wrksize);
cufftMakePlanMany(planb,2,dims,NULL,1,0,NULL,1,0,CUFFT_C2R,1,&wrksize);
// Forward transform
cufftExecR2C(planf, u_d, u_dk);
// Kernel: scale derivative
dim3 dimBlock (BSZ, BSZ);
dim3 dimGrid((N/2)/BSZ+1,N/BSZ);
scale_drv<<<dimGrid, dimBlock>>>(u_dk, k_d, N);
cudaDeviceSynchronize();
// Inverse transform
cufftExecC2R(planb, u_dk, u_d);
// Transfer data back to host
gpuErrChk(cudaMemcpy(f, u_d, memSize, cudaMemcpyDeviceToHost));
// Save expected vs compute solutions to file
FILE *fp1,*fp2;
fp1 = fopen("expected.dat","w");
fp2 = fopen("computed.dat","w");
float error = 0.0f;
for (int i=0; i<N*N; i++) {
fprintf(fp1,"%.3e\n",f_a[i]);
fprintf(fp2,"%.3e\n",f[i]);
// Calculate L2 error norm
error += (f_a[i] - f[i])*(f_a[i] - f[i]);
}
fclose(fp1);
fclose(fp2);
// Print L2 error norm
printf("error = %0.3e\n", sqrt(error));
// Clean up - list incomplete
cufftDestroy(planf);
cufftDestroy(planb);
return 0;
}
当我对 N = 4、8、16、32、64、128 和 256 的预期 f(x,y) 与计算出的 f(x,y) 进行 L2 误差范数比较时,我得到以下结果:
error = 2.190e+02
error = 8.118e+00
error = 2.887e-02
error = 6.510e-02
error = 1.715e-01
error = 5.509e-01
error = 2.874e+00
我预计误差范数会下降(就像 N = 4 到 16 一样),但是当 N = 32 及以上时它实际上会上升。为什么会这样?这是我计算 L2 错误范数的方法:
for (int i=0; i<N*N; i++) {
...
// Calculate L2 error norm
error += (f_a[i] - f[i])*(f_a[i] - f[i]);
}
...
// Print L2 error norm
printf("error = %0.3e\n", sqrt(error));
我想知道越来越多的数字是由污染错误引起的,还是实际上意味着 FFT 没有变得更加准确。任何输入表示赞赏