2

我正在 CUDA 中设置一维 fftshift。我的代码如下

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    double2 temp;

    if(i< N/2)
    {
        temp.x =  u_d[i].x;
        temp.y =  u_d[i].y;

        u_d[i].x =u_d[i+N/2].x;
        u_d[i].y =u_d[i+N/2].y;

        u_d[i+N/2].x = temp.x;
        u_d[i+N/2].y = temp.y;
    }
}

有没有比上面显示的更聪明的方法来在 CUDA 中执行 fftshift?

提前致谢。

也许更好的解决方案

我发现也许以下解决方案可能是一个不错的选择

__global__ void fftshift(double2 *u_d, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < N)
    {
        double a = pow(-1.0,i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}

它包括将要转换的向量乘以1s 和-1s 的序列,这等效于乘以 exp(-j n pi) 并因此等效于共轭域中的移位。

您必须在应用 CUFFT 之前和之后调用此内核。

一个优点是避免了内存移动/交换,这个想法可以立即扩展到 2D 案例,请参阅CUDA Device To Device transfer成本昂贵

关于对称数据

该解决方案似乎不仅限于对称数据。试试下面的 Matlab 代码,将这个想法应用于完全复杂的随机矩阵(高斯幅度和均匀相位)。

N1=512;
N2=256;

Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);

Im=Magnitude.*exp(j*Phase);

Transform=fftshift(fft2(ifftshift(Im)));

n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);

100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))

返回的归一化均方根误差将是0,确认Transform=Im4

提高速度

根据NVIDIA 论坛收到的建议,可以通过更改指令来提高速度

double a = pow(-1.0,i&1);

double a = 1-2*(i&1);

避免使用缓慢的套路pow

4

2 回答 2

3

经过大量时间和 cuFFT 回调功能的介绍,我可以为我自己的问题提供一个有意义的答案。

上面我提出了一个“也许更好的解决方案”。经过一些测试,我意识到,如果不使用回调 cuFFT 功能,该解决方案会更慢,因为它使用pow. 然后,我探索了使用的两种替代方法pow,例如

float a     = (float)(1-2*((int)offset%2));

float2  out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;

float2  out = ((float2*)d_in)[offset];

if ((int)offset&1) {

    out.x = -out.x;
    out.y = -out.y;

}

但是,对于标准 cuFFT,上述所有解决方案都需要两个单独的内核调用,一个用于 fftshift,另一个用于 cuFFT 执行调用。但是,使用新的 cuFFT 回调功能,上述替代解决方案可以作为__device__函数嵌入到代码中。

所以,最后我得到了下面的比较代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

#include <cufft.h>
#include <cufftXt.h>

//#define DEBUG

#define BLOCKSIZE 256

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
             return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }

    return "<unknown>";
}
#endif

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                            _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N/2)
    {
        float2 temp = d_inout[tid];
        d_inout[tid] = d_inout[tid + (N / 2)];
        d_inout[tid + (N / 2)] = temp;
    }
}

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a     = (float)(1-2*((int)offset%2));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float a = pow(-1.,(double)(offset&1));

    float2  out = ((float2*)d_in)[offset];
    out.x = out.x * a;
    out.y = out.y * a;
    return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

    float2  out = ((float2*)d_in)[offset];

    if ((int)offset&1) {

        out.x = -out.x;
        out.y = -out.y;

    }
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;

/********/
/* MAIN */
/********/
int main()
{
    const int N = 131072;

    printf("N = %d\n", N);

    // --- Host side input array
    float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
    for (int i=0; i<N; i++) {
        h_vect[i].x = (float)rand() / (float)RAND_MAX;
        h_vect[i].y = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side output arrays
    float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
    float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));

    // --- Device side input arrays
    float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
    float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
    float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
    float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
    gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));

    // --- Device side output arrays
    float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
    float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    /*******************************************/
    /* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
    /*******************************************/
    cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
    fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Memory movements elapsed time:  %3.3f ms \n", time);
    gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V1 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
    cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
    cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v1 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }

    printf("Chessboard v1 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V2 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
    cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v2 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }

    printf("Chessboard v2 test passed!\n");

    /****************************************/
    /* CHESSBOARD MULTIPLICATION V3 + cuFFT */
    /****************************************/
    cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;

    gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
    cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
    status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
    if (status == CUFFT_LICENSE_ERROR) {
        printf("This sample requires a valid license file.\n");
        printf("The file was either not found, out of date, or otherwise invalid.\n");
        exit(EXIT_FAILURE);
    } else {
        cufftSafeCall(status);
    }
    cudaEventRecord(start, 0);
    cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Chessboard v3 elapsed time:  %3.3f ms \n", time);

    gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }

    printf("Chessboard v3 test passed!\n");

    return 0;
}

GTX 480 上的结果

N         Mem mov   v1      v2      v3 
131072    0.552     0.136   0.354   0.183 
262144    0.536     0.175   0.451   0.237 
524288    0.661     0.283   0.822   0.290 
1048576   0.784     0.565   1.548   0.548 
2097152   1.298     0.952   2.973   0.944 

特斯拉 C2050 的结果

N         Mem mov   v1      v2      v3 
131072    0.278     0.130   0.236   0.132 
262144    0.344     0.202   0.374   0.206 
524288    0.544     0.378   0.696   0.387 
1048576   0.909     0.695   1.294   0.695 
2097152   1.656     1.349   2.531   1.349 

开普勒 K20c 的结果

N         Mem mov   v1      v2      v3 
131072    0.077     0.076   0.136   0.076 
262144    0.142     0.128   0.202   0.127 
524288    0.268     0.229   0.374   0.230 
1048576   0.516     0.433   0.717   0.435 
2097152   1.019     0.853   1.400   0.855 
于 2014-10-03T22:35:55.613 回答
2

如果空间不是问题(并且仅对一个维度使用 fftshift),则创建大小为 1.5 x N,并在末尾u_d写入第一个元素。N/2然后你可以移动u_du_d + N / 2

这是你可以做到的。

double2 *u_d, *u_d_begin;
size_t bytes = N * sizeof(double2);
// This is different from bytes / 2 when N is odd
size_t half_bytes = (N / 2) * sizeof(double2);
CUDA_CHK(cudaMalloc( &u_d, bytes + half_bytes ));
u_d_begin = u_d;
...
// Do some processing and populate u_d;
...
// Copy first half to the end
CUDA_CHK(cudaMemcpy(u_d + N, u_d, half_bytes, cudaMemcpyDeviceToDevice));
u_d = u_d + N /2;
于 2013-01-06T22:45:56.070 回答