cufftExecC2C随着 cuFFT 回调功能的引入,通过将归一化操作定义为函数,可以将 cuFFT 执行的逆 FFT 所需的归一化直接嵌入到调用中__device__。
除了 cuFFT 用户指南,有关 cuFFT 回调功能,请参阅
CUDA Pro 提示:使用 cuFFT 回调进行自定义数据处理
下面是通过 cuFFT 回调实现 IFFT 标准化的示例。
    #include <stdio.h>
    #include <assert.h>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <cufft.h>
    #include <cufftXt.h>
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess)
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    /*********************/
    /* CUFFT ERROR CHECK */
    /*********************/
    // See http://stackoverflow.com/questions/16267149/cufft-error-handling
    #ifdef _CUFFT_H_
    static const char *_cudaGetErrorEnum(cufftResult error)
    {
        switch (error)
        {
            case CUFFT_SUCCESS:
                return "CUFFT_SUCCESS";
            case CUFFT_INVALID_PLAN:
                return "CUFFT_INVALID_PLAN";
            case CUFFT_ALLOC_FAILED:
                return "CUFFT_ALLOC_FAILED";
            case CUFFT_INVALID_TYPE:
                 return "CUFFT_INVALID_TYPE";
            case CUFFT_INVALID_VALUE:
                return "CUFFT_INVALID_VALUE";
            case CUFFT_INTERNAL_ERROR:
                return "CUFFT_INTERNAL_ERROR";
            case CUFFT_EXEC_FAILED:
                return "CUFFT_EXEC_FAILED";
            case CUFFT_SETUP_FAILED:
                return "CUFFT_SETUP_FAILED";
            case CUFFT_INVALID_SIZE:
                return "CUFFT_INVALID_SIZE";
            case CUFFT_UNALIGNED_DATA:
                return "CUFFT_UNALIGNED_DATA";
        }
        return "<unknown>";
    }
    #endif
    #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
    {
        if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
        }
    }
    __device__ void IFFT_Scaling(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPtr) {
        float *scaling_factor = (float*)callerInfo;
        float2 output;
        output.x = cuCrealf(element);
        output.y = cuCimagf(element);
        output.x = output.x / scaling_factor[0];
        output.y = output.y / scaling_factor[0];
        ((float2*)dataOut)[offset] = output;
}
    __device__ cufftCallbackStoreC d_storeCallbackPtr = IFFT_Scaling;
    /********/
    /* MAIN */
    /********/
    int main() {
        const int N = 16;
        cufftHandle plan;
        float2 *h_input             = (float2*)malloc(N*sizeof(float2));
        float2 *h_output1           = (float2*)malloc(N*sizeof(float2));
        float2 *h_output2           = (float2*)malloc(N*sizeof(float2));
        float2 *d_input;            gpuErrchk(cudaMalloc((void**)&d_input, N*sizeof(float2)));
        float2 *d_output1;          gpuErrchk(cudaMalloc((void**)&d_output1, N*sizeof(float2)));
        float2 *d_output2;          gpuErrchk(cudaMalloc((void**)&d_output2, N*sizeof(float2)));
        float *h_scaling_factor     = (float*)malloc(sizeof(float));
        h_scaling_factor[0] = 16.0f;
        float *d_scaling_factor;    gpuErrchk(cudaMalloc((void**)&d_scaling_factor, sizeof(float)));
        gpuErrchk(cudaMemcpy(d_scaling_factor, h_scaling_factor, sizeof(float), cudaMemcpyHostToDevice));
        for (int i=0; i<N; i++) {
            h_input[i].x = 1.0f;
            h_input[i].y = 0.f;
        }
        gpuErrchk(cudaMemcpy(d_input, h_input, N*sizeof(float2), cudaMemcpyHostToDevice));
        cufftSafeCall(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
        cufftSafeCall(cufftExecC2C(plan, d_input, d_output1, CUFFT_FORWARD));
        gpuErrchk(cudaMemcpy(h_output1, d_output1, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Direct transform - %d - (%f, %f)\n", i, h_output1[i].x, h_output1[i].y);
        cufftCallbackStoreC h_storeCallbackPtr;
        gpuErrchk(cudaMemcpyFromSymbol(&h_storeCallbackPtr, d_storeCallbackPtr, sizeof(h_storeCallbackPtr)));
        cufftSafeCall(cufftXtSetCallback(plan, (void **)&h_storeCallbackPtr, CUFFT_CB_ST_COMPLEX, (void **)&d_scaling_factor));
        cufftSafeCall(cufftExecC2C(plan, d_output1, d_output2, CUFFT_INVERSE));
        gpuErrchk(cudaMemcpy(h_output2, d_output2, N*sizeof(float2), cudaMemcpyDeviceToHost));
        for (int i=0; i<N; i++) printf("Inverse transform - %d - (%f, %f)\n", i, h_output2[i].x, h_output2[i].y);
        cufftSafeCall(cufftDestroy(plan));
        gpuErrchk(cudaFree(d_input));
        gpuErrchk(cudaFree(d_output1));
        gpuErrchk(cudaFree(d_output2));
        return 0;
    }
编辑
执行回调操作的“时刻”CUFFT_CB_ST_COMPLEX在对 的调用中指定cufftXtSetCallback。请注意,您可以使用相同的 cuFFT 计划加载和存储回调。