2

我试图计算一个动态分配的数组的 FFT。输入数组使用 cudaMemcpy2D 从主机复制到设备,然后执行 FFT(cufftExecR2C),并将结果从设备复制回主机。

所以我最初的问题是如何在 FFT 中使用 pitch(行间距)信息。然后我在这里找到了答案 - CUFFT:如何计算 pitched 指针(由 cudaMallocPitch 分配的带行间距的指针)的 FFT?

但不幸的是,它不起作用。我得到的结果是垃圾值。下面给出的是我的代码。

#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h> 
#include <iomanip> 
#include <iostream>
#include <vector>

using namespace std;

// Dimensions of the real-valued input: NX rows by NY columns.
const size_t NX = 4;
const size_t NY = 6;

// Copies an NX x NY float array into pitched device memory, runs a 2D
// real-to-complex cuFFT plan on it and prints the NX x (NY/2+1) complex
// result. The status of each CUDA runtime call is printed to stdout; the
// return codes of the cuFFT calls are not checked.
int main()
    { 
    // Input array (static) - host side 
    float h_in_data_static[NX][NY] ={ 
        {0.7943 ,   0.6020 ,   0.7482  ,  0.9133  ,  0.9961 , 0.9261},
        {0.3112 ,   0.2630 ,   0.4505  ,  0.1524  ,  0.0782 ,  0.1782},
        {0.5285 ,   0.6541 ,   0.0838  ,  0.8258  ,  0.4427,  0.3842},
        {0.1656 ,   0.6892 ,   0.2290  ,  0.5383  ,  0.1067,  0.1712}
        };

    // --------------------------------
    // Input array (dynamic) - host side: contiguous row-major copy of the
    // static array above
    float *h_in_data_dynamic = new float[NX*NY];  

    // Set the values
    // (h_ipitch is declared here but never used.)
    size_t h_ipitch;
    for (int r = 0; r < NX; ++r)  // this can be also done on GPU
        {    
        for (int c = 0; c < NY; ++c)
            {   h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c];   }
        }
    // --------------------------------

    // Output array - host side: NX rows of NY/2+1 complex values
    float2 *h_out_data_temp = new float2[NX*(NY/2+1)] ; 


    // Input and Output array - device side 
    cufftHandle plan;
    cufftReal *d_in_data;      
    cufftComplex * d_out_data;
    // Transform size per dimension, passed to cufftPlanMany below.
    int n[NRANK] = {NX, NY};

    //  Copy input array from Host to Device
    //  ipitch receives the row pitch, in bytes, chosen by cudaMallocPitch.
    size_t ipitch;
    cudaError  cudaStat1 =  cudaMallocPitch((void**)&d_in_data,&ipitch,NY*sizeof(cufftReal),NX);    
    cout << cudaGetErrorString(cudaStat1) << endl;
    cudaError  cudaStat2 =  cudaMemcpy2D(d_in_data,ipitch,h_in_data_dynamic,NY*sizeof(float),NY*sizeof(float),NX,cudaMemcpyHostToDevice);   
    cout << cudaGetErrorString(cudaStat2) << endl;

    //  Allocate memory for output array - device side
    //  opitch receives the row pitch, in bytes, of the complex output rows.
    size_t opitch;
    cudaError  cudaStat3 =  cudaMallocPitch((void**)&d_out_data,&opitch,(NY/2+1)*sizeof(cufftComplex),NX);  
    cout << cudaGetErrorString(cudaStat3) << endl;

    //  Perform the fft
    int rank = 2; // 2D fft     
    int istride = 1, ostride = 1; // Stride lengths
    int idist = 1, odist = 1;     // Distance between batches
    // NOTE(review): ipitch/opitch are byte pitches from cudaMallocPitch and
    // are passed unscaled as the FIRST nembed entry. cuFFT's advanced data
    // layout presumably expects element counts, with the padded row length
    // as the LAST entry - verify against the cuFFT documentation. This is
    // the likely cause of the garbage output this program produces.
    int inembed[] = {ipitch, NX}; // Input size with pitch
    int onembed[] = {opitch, NX}; // Output size with pitch
    int batch = 1;
    // None of the three cuFFT calls below has its return status checked.
    cufftPlanMany(&plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch);
    //cufftPlan2d(&plan, NX, NY , CUFFT_R2C);
    cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
    cufftExecR2C(plan, d_in_data, d_out_data);
    // NOTE(review): cudaThreadSynchronize is presumably the deprecated
    // spelling of cudaDeviceSynchronize - confirm against the toolkit in use.
    cudaThreadSynchronize();

    // Copy d_out_data (the FFT result) back from device to host, dropping the
    // row padding so the host rows are tightly packed
    cudaError  cudaStat4 = cudaMemcpy2D(h_out_data_temp,(NY/2+1)*sizeof(float2), d_out_data, opitch, (NY/2+1)*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost); 
    cout << cudaGetErrorString(cudaStat4) << endl;

    // Print the results, one output row per line as "re + imi"
    for (int i = 0; i < NX; i++)    
        {
        for (int j =0 ; j< NY/2 + 1; j++)       
            printf(" %f + %fi",h_out_data_temp[i*(NY/2+1) + j].x ,h_out_data_temp[i*(NY/2+1) + j].y);
        printf("\n");    
        }
    // Only the device input buffer is released here; d_out_data, the cuFFT
    // plan and both host arrays allocated with new[] are not freed.
    cudaFree(d_in_data);

    return 0;
    }

我认为问题出在 cufftPlanMany 的调用上。我该如何解决这个问题?

4

1 回答 1

8

您可能需要仔细研究文档的高级数据布局部分

我认为您链接的上一个问题有些令人困惑,因为该问题传递 width 和 height 参数的顺序与我对 cufft 2D 计划的预期相反。但是,那个答案随后沿用了该顺序,因此至少是一致的。

其次,您忽略了上一个问题中的一点:传入 inembed 和 onembed 的 “pitch” 参数,与您从 cudaMallocPitch 操作得到的 pitch 参数并不相同。它们必须按输入和输出数据集中每个数据元素的字节数进行缩放(即用字节 pitch 除以元素大小,得到以元素为单位的行长度)。我实际上并不完全确定这是否是 inembed 和 onembed 参数的预期用途,但它似乎有效。

当我调整您的代码以考虑上述两个更改时,我似乎得到了有效的结果,至少它们似乎在合理的范围内。您现在已经发布了几个关于 2D FFT 的问题,您说结果不正确。我无法在脑海中进行这些 2D FFT,因此我建议您将来指出您期望的数据。

以下是包含我所做更改的代码:

#define NRANK 2
#define BATCH 10

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;

const size_t NX = 4;   // rows of the real input (slowest-varying FFT dimension)
const size_t NY = 6;   // columns of the real input (fastest-varying FFT dimension)

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
    {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
    }

// Reports a failed cuFFT call on stderr.
// Returns true when `status` is CUFFT_SUCCESS, false otherwise.
static bool cufftOk(cufftResult status, const char *what)
    {
    if (status == CUFFT_SUCCESS)
        return true;
    fprintf(stderr, "%s failed: cuFFT status %d\n", what, static_cast<int>(status));
    return false;
    }

// Computes the 2D real-to-complex FFT of an NX x NY array that lives in
// pitched device memory (cudaMallocPitch) and prints the NX x (NY/2+1)
// complex result, one output row per line.
//
// The row padding introduced by cudaMallocPitch is described to cuFFT through
// the advanced data layout parameters of cufftPlanMany: inembed/onembed hold
// the padded row length in ELEMENTS (byte pitch / element size) as their last
// entry.
//
// Every CUDA/cuFFT status is checked; on the first failure the remaining
// steps are skipped, all resources are still released, and the process exits
// with status 1. Returns 0 on success.
int main()
    {
    // Input array (static) - host side
    float h_in_data_static[NX][NY] ={
        {0.7943 ,   0.6020 ,   0.7482  ,  0.9133  ,  0.9961 , 0.9261},
        {0.3112 ,   0.2630 ,   0.4505  ,  0.1524  ,  0.0782 ,  0.1782},
        {0.5285 ,   0.6541 ,   0.0838  ,  0.8258  ,  0.4427,  0.3842},
        {0.1656 ,   0.6892 ,   0.2290  ,  0.5383  ,  0.1067,  0.1712}
        };

    // cuFFT takes its sizes as int; convert once instead of relying on
    // implicit size_t -> int narrowing inside brace initializers.
    const int rows   = static_cast<int>(NX);
    const int cols   = static_cast<int>(NY);
    const int owidth = cols/2 + 1;   // complex values per output row of an R2C transform

    // Input array (dynamic) - host side: contiguous row-major copy
    float *h_in_data_dynamic = new float[NX*NY];
    for (size_t r = 0; r < NX; ++r)  // this can be also done on GPU
        {
        for (size_t c = 0; c < NY; ++c)
            h_in_data_dynamic[NY*r + c] = h_in_data_static[r][c];
        }

    // Output array - host side: tightly packed, owidth complex values per row
    float2 *h_out_data_temp = new float2[NX*owidth];

    // Device-side state. Pointers start out NULL so the unconditional cleanup
    // at the end is safe even if an allocation failed (cudaFree(NULL) is a no-op).
    cufftHandle plan;
    bool planCreated = false;
    cufftReal *d_in_data = NULL;
    cufftComplex *d_out_data = NULL;
    size_t ipitch = 0;   // byte pitch of the input rows
    size_t opitch = 0;   // byte pitch of the output rows

    // Allocate both pitched buffers and upload the input. The && chain stops
    // at the first failing call.
    bool ok =
        cudaOk(cudaMallocPitch((void**)&d_in_data, &ipitch, NY*sizeof(cufftReal), NX),
               "cudaMallocPitch (input)")
        && cudaOk(cudaMallocPitch((void**)&d_out_data, &opitch, owidth*sizeof(cufftComplex), NX),
                  "cudaMallocPitch (output)")
        && cudaOk(cudaMemcpy2D(d_in_data, ipitch, h_in_data_dynamic, NY*sizeof(float),
                               NY*sizeof(float), NX, cudaMemcpyHostToDevice),
                  "cudaMemcpy2D (host to device)");

    // cuFFT describes the layout in elements, so each byte pitch must be a
    // whole number of elements for the padded rows to be addressable.
    if (ok && (ipitch % sizeof(cufftReal) != 0 || opitch % sizeof(cufftComplex) != 0))
        {
        fprintf(stderr, "pitch is not a multiple of the element size\n");
        ok = false;
        }

    if (ok)
        {
        int n[NRANK] = {rows, cols};                                          // transform size
        const int irow = static_cast<int>(ipitch / sizeof(cufftReal));        // padded input row, in elements
        const int orow = static_cast<int>(opitch / sizeof(cufftComplex));     // padded output row, in elements
        int inembed[NRANK] = {rows, irow};   // input storage dimensions with pitch
        int onembed[NRANK] = {rows, orow};   // output storage dimensions with pitch
        const int istride = 1, ostride = 1;  // consecutive elements within a row
        // Distance between consecutive batches = one whole padded array.
        // Only one batch is transformed here, but this keeps the plan correct
        // if `batch` is ever raised.
        const int idist = rows * irow;
        const int odist = rows * orow;
        const int batch = 1;

        planCreated = cufftOk(cufftPlanMany(&plan, NRANK, n, inembed, istride, idist,
                                            onembed, ostride, odist, CUFFT_R2C, batch),
                              "cufftPlanMany");

        // The legacy cufftSetCompatibilityMode call is intentionally omitted:
        // the layout is fully specified by inembed/onembed above.
        ok = planCreated
            && cufftOk(cufftExecR2C(plan, d_in_data, d_out_data), "cufftExecR2C")
            && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")
            // Copy d_out_data (the FFT result) back to the host, dropping the row padding
            && cudaOk(cudaMemcpy2D(h_out_data_temp, owidth*sizeof(float2), d_out_data, opitch,
                                   owidth*sizeof(cufftComplex), NX, cudaMemcpyDeviceToHost),
                      "cudaMemcpy2D (device to host)");
        }

    // Print the results
    if (ok)
        {
        for (int i = 0; i < rows; i++)
            {
            for (int j = 0; j < owidth; j++)
                printf(" %f + %fi", h_out_data_temp[i*owidth + j].x, h_out_data_temp[i*owidth + j].y);
            printf("\n");
            }
        }

    // Release everything that was acquired, on success and on failure alike.
    if (planCreated)
        cufftDestroy(plan);
    cudaFree(d_in_data);
    cudaFree(d_out_data);
    delete[] h_out_data_temp;
    delete[] h_in_data_dynamic;

    return ok ? 0 : 1;
    }
于 2013-12-31T06:58:06.700 回答