我制作了一个快速程序以确保我可以正确使用 cufft 库。当我运行“1”的批量大小时,我得到了我期望的结果。但是,随着我增加批处理大小,我在数据缓冲区的末尾得到了似乎是随机字节的内容。如果批量大小为 2,则最后三个条目是噪声。如果批处理大小为 3,我会在缓冲区末尾的最后六个条目中以及在批处理中三个转换中的第二个应该是结果的末尾的三个条目中得到噪声。
批处理中第二次变换结果末尾的坏数据示例:
7.680291 1.411589
<- 好的数据
7.748493 1.062853
7.797380 0.710554
7.826757 0.355854
-436781318144.000000 -436781318144.000000
<- 坏结果的开始
5349828096.000000 5000401408.000000
5511789568.000000 4813803008.000000
5664713728.000000 4619900416.000000
<- 输出结束
代码:
#define NX 1024
#define BATCH 4
#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <Windows.h>
#include <math.h>
int main()
{
cufftHandle plan;
cufftComplex *deviceData;
cufftComplex *hostData;
FILE* output;
char fileName[256];
int i, j;
cudaMalloc((void**)&deviceData, NX * BATCH * sizeof(cufftComplex));
hostData = (cufftComplex*)malloc(NX * BATCH * sizeof(cufftComplex);
//Initalize array with a real sin wave, increasing the frequency of the wave for each transform in the batch (indexed by "j")
for (j = 0; j < BATCH; j++)
{
for (i = 0; i < NX; i++)
{
hostData[i + j*BATCH].x = sin(i*(j+1) / (float)10);
hostData[i + j*BATCH].y = 0;
}
}
cudaMemcpy(deviceData, hostData, NX * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
cufftExecC2C(plan, deviceData, deviceData, CUFFT_FORWARD);
cudaThreadSynchronize();
cudaMemcpy(hostData, deviceData, NX * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(deviceData);
output = fopen("outputFile.txt", "w");
//Write one file for each transform in the batch
for (j = 0; j < BATCH; j++)
{
memset(fileName, '\0', 256);
sprintf(fileName, "outputFile_%d.txt", j);
output = fopen(fileName, "w");
for (i = 0; i < NX; i++)
fprintf(output, "%f\t%f\n", hostData[i + j*BATCH].x, hostData[i + j*BATCH].y);
fclose(output);
}
}