我正在尝试使用 Visual Studio 2010 在 CUDA 中开发 FFT 的实现,到目前为止,我已经让它在一个块内的 1024 个点上工作。问题是,每当我使用多个块时,块 1 的结果会正常,而其他块将返回错误值(看起来不是随机的,它们在多次运行中不会改变。)这是我的内核
__device__ void FFT(int idxS,int bfsize, Complex* data1, Complex* data0, int k, int N ){
Complex alpha;
if((idxS % bfsize) < (bfsize/2)){
data1[idxS] = ComplexAdd(data0[idxS],data0[idxS+bfsize/2]);
}
else
{
float angle = -PI*2*((idxS*(1<<k)%(bfsize/2)))/N;
alpha.x = cos(angle);
alpha.y= sin(angle);
Complex v0;
v0 = ComplexAdd(data0[idxS-bfsize/2] ,ComplexScale(data0[idxS],-1));
data1[idxS] = ComplexMul(v0, alpha);
}
}
__device__ void Ordenador(int r, int idxS ,Complex* data1, Complex* data0 ){
int p = 0;
for(int k = 0;k < r;k++)
{
if(idxS & (1<<k))
p+=1<<(r - k - 1);
}
data1[idxS] = data0[p];
__syncthreads();
}
__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
__syncthreads;
int bfsize = 1<<(r - k);
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
}
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (N,1,1);
const dim3 numBlocks(1,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
我试过使用共享内存,但是它会返回全 0,我认为 CUDA 没有从全局复制到共享(NSight 会告诉我该内存位置的值是????)。这段代码现在应该只是一个概念证明,不必优化,只需返回正确的值。如果你们需要完整的代码,我会提供。一个多月以来,我一直在寻找解决方案,这是我绝望的呼吁。
谢谢,约翰
- - - - 更新 - - - -
我出于调试目的更改了代码,在 2 个块中的每个块中启动了 2 个线程。
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (2,1,1);
const dim3 numBlocks(2,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
---------------------编辑2 ---------
真正奇怪的是,当我使用 memcheck(在任何模式下)时,程序会返回正确的结果。
----最终编辑---------------
我发现问题出在这段代码中
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
我发现在新函数中分离最后一行并用 CPU 调用它对我来说产生了正确的结果。感谢您的帮助!!此致!