matlab - 通过 cuFFT 在逆 FFT 中进行缩放

Question

每当我使用 cuFFT 绘制程序获得的值并将结果与 Matlab 的结果进行比较时，我都会得到相同形状的图形，并且最大值和最小值的值在相同的点上。但是，由 cuFFT 得出的值远大于由 Matlab 得出的值。Matlab代码是

fs = 1000;                              % sample freq
D = [0:1:4]';                           % pulse delay times
t = 0 : 1/fs : 4000/fs;                 % signal evaluation time
w = 0.5;                                % width of each pulse
yp = pulstran(t,D,'rectpuls',w);
filt = conj(fliplr(yp));
xx = fft(yp,1024).*fft(filt,1024);
xx = (abs(ifft(xx)));

具有相同输入的 CUDA 代码如下：

cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD);
cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal,     CUFFT_FORWARD);
ComplexPointwiseMul<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX);
cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE);

cuFFT 还执行1024批量大小为的点 FFT 2。

使用的比例因子时NX=1024，值不正确。请告诉我该怎么做。

score 5 · Accepted Answer

这是从未回答列表中删除此问题的较晚答案。

您没有提供足够的信息来诊断您的问题，因为您没有指定设置 cuFFT 计划的方式。您甚至没有指定 Matlab 和 cuFFT 信号的形状是否完全相同（因此您只有一个缩放比例），或者您是否具有大致相同的形状。但是，让我提出以下两个意见：

yp向量有元素4000；与它相反，您通过将信号截断为元素fft(yp,1024)来执行 FFT ；1024
逆 cuFFT 不按向量元素的数量进行缩放。

为了方便起见（它可能对其他用户有用），我在下面报告一个简单的 FFT-IFFT 方案，其中还包括使用 CUDA Thrust 库执行的缩放。

#include <cufft.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

/*********************/
/* SCALE BY CONSTANT */
/*********************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

int main(void){

    const int N=4;

    // --- Setting up input device vector    
    thrust::device_vector<float2> d_vec(N,make_cuComplex(1.f,2.f));

    cufftHandle plan;
    cufftPlan1d(&plan, N, CUFFT_C2C, 1);

    // --- Perform in-place direct Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_FORWARD);

    // --- Perform in-place inverse Fourier transform
    cufftExecC2C(plan, thrust::raw_pointer_cast(d_vec.data()),thrust::raw_pointer_cast(d_vec.data()), CUFFT_INVERSE);

    thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), Scale_by_constant((float)(N)));

    // --- Setting up output host vector    
    thrust::host_vector<float2> h_vec(d_vec);

    for (int i=0; i<N; i++) printf("Element #%i; Real part = %f; Imaginary part: %f\n",i,h_vec[i].x,h_vec[i].y);

    getchar();
}

score 4 · Accepted Answer

cufftExecC2C随着 cuFFT 回调功能的引入，通过将归一化操作定义为函数，可以将 cuFFT 执行的逆 FFT 所需的归一化直接嵌入到调用中__device__。

除了 cuFFT 用户指南，有关 cuFFT 回调功能，请参阅

CUDA Pro 提示：使用 cuFFT 回调进行自定义数据处理

下面是通过 cuFFT 回调实现 IFFT 标准化的示例。

    #include <stdio.h>
    #include <assert.h>

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <cufft.h>
    #include <cufftXt.h>

    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    /*********************/
    /* CUFFT ERROR CHECK */
    /*********************/
    // See http://stackoverflow.com/questions/16267149/cufft-error-handling
    #ifdef _CUFFT_H_
    static const char *_cudaGetErrorEnum(cufftResult error)
    {
        switch (error)
        {
            case CUFFT_SUCCESS:
                return "CUFFT_SUCCESS";

            case CUFFT_INVALID_PLAN:
                return "CUFFT_INVALID_PLAN";

            case CUFFT_ALLOC_FAILED:
                return "CUFFT_ALLOC_FAILED";

            case CUFFT_INVALID_TYPE:
                 return "CUFFT_INVALID_TYPE";

            case CUFFT_INVALID_VALUE:
                return "CUFFT_INVALID_VALUE";

            case CUFFT_INTERNAL_ERROR:
                return "CUFFT_INTERNAL_ERROR";

            case CUFFT_EXEC_FAILED:
                return "CUFFT_EXEC_FAILED";

            case CUFFT_SETUP_FAILED:
                return "CUFFT_SETUP_FAILED";

            case CUFFT_INVALID_SIZE:
                return "CUFFT_INVALID_SIZE";

            case CUFFT_UNALIGNED_DATA:
                return "CUFFT_UNALIGNED_DATA";
        }

        return "<unknown>";
    }
    #endif

    #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
    {
        if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
        }
    }

    __device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

        float *scaling_factor = (float*)callerInfo;

        float2 output;
        output.x = cuCrealf(element);
        output.y = cuCimagf(element);

        output.x = output.x / scaling_factor[0];
        output.y = output.y / scaling_factor[0];

        ((float2*)dataOut)[offset] = output;
}

    __device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

    /********/
    /* MAIN */
    /********/
    int main() {

        const int N = 16;

        cufftHandle plan;

        float2 *h_input             = (float2*)malloc(N*sizeof(float2));
        float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
        float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

        float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
        float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
        float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

        float *h_scaling_factor     = (float*)malloc(sizeof(float));
        h_scaling_factor[0] = 16.0f;
        float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
        gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

        for (int i=0; i<N; i++) {
            h_input[i].x = 1.0f;
            h_input[i].y = 0.f;
        }

        gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));

        cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

        cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
        gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);

        cufftCallbackStoreC h_storeCallbackPtr;
        gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));

        cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

        cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
        gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);

        cufftSafeCall(cufftDestroy(plan));

        gpuErrchk(cudaFree(d_input));
        gpuErrchk(cudaFree(d_output1));
        gpuErrchk(cudaFree(d_output2));

        return 0;
    }

编辑

执行回调操作的“时刻”CUFFT_CB_ST_COMPLEX在对的调用中指定cufftXtSetCallback。请注意，您可以使用相同的 cuFFT 计划加载和存储回调。

score 2 · Accepted Answer

表现

对于 IFFT 缩放的这种特殊情况，我正在添加一个进一步的答案，以将回调性能与相同代码的非回调版本进行比较。我正在使用的代码是

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cufft.h>
#include <cufftXt.h>

#include <thrust/device_vector.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

//#define DISPLAY

/*******************************/
/* THRUST FUNCTOR IFFT SCALING */
/*******************************/
class Scale_by_constant
{
    private:
        float c_;

    public:
        Scale_by_constant(float c) { c_ = c; };

        __host__ __device__ float2 operator()(float2 &a) const
        {
            float2 output;

            output.x = a.x / c_;
            output.y = a.y / c_;

            return output;
        }

};

/**********************************/
/* IFFT SCALING CALLBACK FUNCTION */
/**********************************/
__device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {

    float *scaling_factor = (float*)callerInfo;

    float2 output;
    output.x = cuCrealf(element);
    output.y = cuCimagf(element);

    output.x = output.x / scaling_factor[0];
    output.y = output.y / scaling_factor[0];

    ((float2*)dataOut)[offset] = output;
}

__device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;

/********/
/* MAIN */
/********/
int main() {

    const int N = 100000000;

    cufftHandle plan;           cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

    TimingGPU timerGPU;

    float2 *h_input             = (float2*)malloc(N*sizeof(float2));
    float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
    float2 *h_output2           = (float2*)malloc(N*sizeof(float2));

    float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
    float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
    float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));

    // --- Callback function parameters
    float *h_scaling_factor     = (float*)malloc(sizeof(float));
    h_scaling_factor[0] = 16.0f;
    float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
    gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));

    // --- Initializing the input on the host and moving it to the device
    for (int i = 0; i < N; i++) {
        h_input[i].x = 1.0f;
        h_input[i].y = 0.f;
    }
    gpuErrchk(cudaMemcpy(d_input, h_input, N * sizeof(float2), cudaMemcpyHostToDevice));

    // --- Execute direct FFT on the device and move the results to the host
    cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output1, d_output1, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
#endif

    // --- Execute inverse FFT with subsequent scaling on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
    thrust::transform(thrust::device_pointer_cast(d_output2), thrust::device_pointer_cast(d_output2) + N, thrust::device_pointer_cast(d_output2), Scale_by_constant((float)(N)));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing NO callback %f\n", timerGPU.GetCounter());

    // --- Setup store callback
//    timerGPU.StartCounter();
    cufftCallbackStoreC h_storeCallbackPtr;
    gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
    cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));

    // --- Execute inverse callback FFT on the device and move the results to the host
    timerGPU.StartCounter();
    cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
#ifdef DISPLAY
    gpuErrchk(cudaMemcpy(h_output2, d_output2, N * sizeof(float2), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
#endif
    printf("Timing callback %f\n", timerGPU.GetCounter());

    cufftSafeCall(cufftDestroy(plan));

    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output1));
    gpuErrchk(cudaFree(d_output2));

    return 0;
}

对于如此大的一维阵列和简单的处理（缩放），Kepler K20c 上的时序如下

Non-callback 69.029762 ms
Callback     65.868607 ms

所以，并没有太大的改善。我希望看到的改进是由于在非回调情况下避免了单独的内核调用。对于较小的一维数组，要么没有改进，要么非回调情况运行得更快。

matlab - 通过 cuFFT 在逆 FFT 中进行缩放

3 回答 3

Related

Reference