我是使用 cuda 和岩浆库的新手。我正在尝试一些关于测试问题的函数,一个 2D 热方程。我编写的代码对于 32、64 和 128 的网格大小似乎非常有效。但是对于 256 或更大的网格大小,它会产生错误的结果。我只在这里发布部分代码,足以重现错误。传输最终矩阵并在 matlab 中查看它表明第二次调用 magmablas_dgemm 将错误引入解决方案。
有没有人可以看到为什么这个代码会因为更大的网格尺寸而中断?
int main(int argc, char* argv[])
{
// Get parameters for problem set up
int side_width = atoi(argv[1]); //assuming square grid, N/32 integer
double dx = 2.0 / (side_width-1);
double dt = 0.25 * dx;
//double Tend = dt*3;// 0.5;
// create memory pointers for derivative operator matrices and solution matrix
double* U;
double* Dleft;
double* Dright;
double* dev_U;
double* dev_Dleft;
double* dev_Dright;
//initialize the MAGMA system
magma_init();
magma_int_t N = side_width;
// temp variables required by MAGMA functions
magma_int_t *piv, info, err;
piv = (magma_int_t*)malloc(N*sizeof(magma_int_t));
// Allocate memory for matrices on host and device
err = magma_dmalloc_cpu(&U, N*N);
err += magma_dmalloc_cpu(&Dleft, N*N);
err += magma_dmalloc_cpu(&Dright, N*N);
err += magma_dmalloc(&dev_U, N*N);
err += magma_dmalloc(&dev_Dleft, N*N);
err += magma_dmalloc(&dev_Dright, N*N);
if (err){
printf("error in allocation. err number = %d\n", err);
exit(1);
}
// zero out matrices (not efficient but correct)
for (int k=0; k<N*N; ++k ){
U[k] = 1.0;
Dleft[k] = 0.0;
Dright[k] = 0.0;
}
//create derivative operator matrices
double a = dt/2.0/dx/dx;
double b = dt/dx/dx;
Dleft[0] = 1.0;
Dleft[N*N-1] = 1.0;
for (int k=1; k<N-1; ++k) {
Dleft[k*N + k-1] = -a;
Dleft[k*N + k] = 1+b;
Dleft[k*N + k+1] = -a;
Dright[k*N + k-1] = a;
Dright[k*N + k] = 1-b;
Dright[k*N + k+1] = a;
}
// Determine block and thread amounts
int grid_dim = ((side_width + 31)/32) ;
int block_dim = 32;
dim3 gridDim(grid_dim, grid_dim);
dim3 blockDim(block_dim, block_dim);
//copy data from host to device
magma_dsetmatrix(N, N, U, N, dev_U, N);
magma_dsetmatrix(N, N, Dleft, N, dev_Dleft, N);
magma_dsetmatrix(N, N, Dright, N, dev_Dright, N);
// LU factorize the left hand operator matrix
magma_dgetrf_gpu(N, N, dev_Dleft, N, piv, &info);
double tn = 0; //time counter
// needed to take first step outside while loop because of some tricky transpose nonsense happening
tn += dt;
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans,MagmaNoTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
// implicit step solve : Dleft*U=Uhat
magma_dgetrs_gpu(MagmaTrans, N, N, dev_Dleft, N, piv, dev_U, N, &info);
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans, MagmaTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
printf("GPU matrix U at time %3.3f \n ", tn);
magma_dprint_gpu(16, 16, dev_U, N);
//copy solution from device to host
magma_dgetmatrix(N, N, dev_U, N, U, N);
//write data to file
char filename[256];
char str_t[128];
sprintf(str_t, "%d", N );
sprintf(filename, "ADI_%s.bin", str_t);
FILE* fileID = fopen(filename, "wb");
for (int i=0; i<N*N; ++i){
fwrite(&U[i],sizeof(double),1,fileID);
}
fclose(fileID);
free(U);
free(Dleft);
free(Dright);
magma_free(dev_U);
magma_free(dev_Dleft);
magma_free(dev_Dright);
free(piv);
magma_finalize();
return 0;
}