
I am trying to launch multiple CUDA FFT kernels asynchronously using streams. For that purpose, I create my streams and my cuFFT forward and inverse plans as follows:

streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i=0; i<streamNum; i++)  
{
    cudaStreamCreate(&streams[i]);
    CHECK_ERROR(5)
    cufftPlan1d(&plansF[i], ticks, CUFFT_R2C,1);
    CHECK_ERROR(5)
    cufftPlan1d(&plansI[i], ticks, CUFFT_C2R,1);
    CHECK_ERROR(5)
    cufftSetStream(plansF[i],streams[i]);
    CHECK_ERROR(5)
    cufftSetStream(plansI[i],streams[i]);
    CHECK_ERROR(5)
}

In the main function, I launch the forward FFTs as follows:

for(w=1;w<q;w++)
  {
    cufftExecR2C(plansF[w], gpuMem1+k,gpuMem2+j);
    CHECK_ERROR(8)
    k += rect_small_real;
    j += rect_small_complex;
  }

I also have other kernels that I launch asynchronously in the same streams.

When I profile my application with Visual Profiler 5.0, I see that all kernels except the CUDA FFTs (both forward and inverse) run in parallel and overlap. The FFT kernels do run in different streams, but they do not overlap; they actually run sequentially. Can anyone tell me what my problem is?

My environment is VS 2008, 64 bit, Windows 7.

Thanks.


3 Answers


Here is a working example of executing cuFFTs and memcopies using CUDA streams on a Kepler architecture.

Here is the code:

#include <stdio.h>

#include <cufft.h>

#define NUM_STREAMS 3

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 5000;

    // --- Host input data initialization
    float2 *h_in1 = new float2[N];
    float2 *h_in2 = new float2[N];
    float2 *h_in3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_in1[i].x = 1.f;
        h_in1[i].y = 0.f;
        h_in2[i].x = 1.f;
        h_in2[i].y = 0.f;
        h_in3[i].x = 1.f;
        h_in3[i].y = 0.f;
    }

    // --- Host output data initialization
    float2 *h_out1 = new float2[N];
    float2 *h_out2 = new float2[N];
    float2 *h_out3 = new float2[N];
    for (int i = 0; i < N; i++) {
        h_out1[i].x = 0.f;
        h_out1[i].y = 0.f;
        h_out2[i].x = 0.f;
        h_out2[i].y = 0.f;
        h_out3[i].x = 0.f;
        h_out3[i].y = 0.f;
    }

    // --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
    gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));

    // --- Device input data allocation
    float2 *d_in1;          gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
    float2 *d_in2;          gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
    float2 *d_in3;          gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
    float2 *d_out1;         gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
    float2 *d_out2;         gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
    float2 *d_out3;         gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));

    // --- Creates CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));

    // --- Creates cuFFT plans and sets them in streams
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int i = 0; i < NUM_STREAMS; i++) {
        cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
        cufftSetStream(plans[i], streams[i]);
    }

    // --- Async memcopies and computations
    gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
    gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
    gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
    cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
    cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
    cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
    gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
    gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
    gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));

    for(int i = 0; i < NUM_STREAMS; i++)
        gpuErrchk(cudaStreamSynchronize(streams[i]));

    // --- Releases resources
    gpuErrchk(cudaHostUnregister(h_in1));
    gpuErrchk(cudaHostUnregister(h_in2));
    gpuErrchk(cudaHostUnregister(h_in3));
    gpuErrchk(cudaHostUnregister(h_out1));
    gpuErrchk(cudaHostUnregister(h_out2));
    gpuErrchk(cudaHostUnregister(h_out3));
    gpuErrchk(cudaFree(d_in1));
    gpuErrchk(cudaFree(d_in2));
    gpuErrchk(cudaFree(d_in3));
    gpuErrchk(cudaFree(d_out1));
    gpuErrchk(cudaFree(d_out2));
    gpuErrchk(cudaFree(d_out3));

    for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in1;
    delete[] h_in2;
    delete[] h_in3;
    delete[] h_out1;
    delete[] h_out2;
    delete[] h_out3;

    cudaDeviceReset();  

    return 0;
}

Please also add cuFFT error checking, following the approach in CUFFT error handling.
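
For completeness, here is a minimal sketch of such a check, in the same style as the gpuErrchk macro above (the cufftErrchk/cufftAssert names are just illustrative; cufftResult and CUFFT_SUCCESS are the actual cuFFT return type and success code):

// Sketch of a cuFFT error-checking wrapper, mirroring gpuErrchk above.
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true)
{
   if (code != CUFFT_SUCCESS)
   {
      fprintf(stderr, "cufftAssert: error %d %s %d\n", (int)code, file, line);
      if (abort) exit(code);
   }
}

// Usage, for example:
// cufftErrchk(cufftPlan1d(&plans[i], N, CUFFT_C2C, 1));
// cufftErrchk(cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD));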

Below is some profiling information obtained by testing the above algorithm on a Kepler K20c card. As you will see, you only achieve true overlap between the computations when N is sufficiently large.

N = 5000

[Visual Profiler timeline screenshot]

N = 50000

[Visual Profiler timeline screenshot]

N = 500000

[Visual Profiler timeline screenshot]
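
To reproduce this kind of timeline yourself, something along these lines should work (just a sketch; the source file name is arbitrary, while nvcc and nvprof are the standard CUDA tools):

nvcc -O2 streams_cufft.cu -o streams_cufft -lcufft
nvprof --print-gpu-trace ./streams_cufft

The resulting executable can also be opened in the Visual Profiler (nvvp) to inspect the per-stream timeline.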

Answered 2014-08-04T20:58:06.153

The problem lies in the hardware you are using.

All CUDA-capable GPUs can execute a kernel and copy data in both directions concurrently. However, only devices with Compute Capability 3.5 have the feature called Hyper-Q.

Briefly, these GPUs implement several (16, I believe) hardware kernel queues. On earlier GPUs, only one hardware queue is available.

This means that cudaStreams are only virtual, and on older hardware using them only makes sense for overlapping computation with memory copies. Of course, this applies not only to cuFFT but also to your own kernels!
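
As a quick check, you can query the compute capability of your card at runtime (a minimal sketch; cudaGetDeviceProperties is the actual runtime API call, the rest is illustrative):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // properties of device 0
    printf("Device: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    // Hyper-Q (multiple hardware kernel queues) requires compute capability >= 3.5
    bool hasHyperQ = (prop.major > 3) || (prop.major == 3 && prop.minor >= 5);
    printf("Hyper-Q available: %s\n", hasHyperQ ? "yes" : "no");
    return 0;
}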

Please look more closely at the Visual Profiler output. You may unintentionally be reading the timeline visualization as exact data about GPU execution, but it is not that simple. Several of the displayed rows may refer to the point in time when the kernel launch line was executed (usually drawn in orange). That launch corresponds to the execution of a particular kernel on the GPU (blue rectangle). The same is true for memory transfers (the exact time is shown as light brown rectangles).

I hope this helps you solve your problem.

Answered 2013-11-27T13:20:46.060

This is a riff on @JackOLantern's code that makes it easy to vary the number of FFTs, the FFT length, and the stream count to experiment with GPU utilization in nvvp.

// Compile with:
// nvcc --std=c++11 stream_parallel.cu -o stream_parallel -lcufft

#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include <cufft.h>

// Print file name, line number, and error code when a CUDA error occurs.
#define check_cuda_errors(val)  __check_cuda_errors__ ( (val), #val, __FILE__, __LINE__ )

template <typename T>
inline void __check_cuda_errors__(T code, const char *func, const char *file, int line) {
    if (code) {
        std::cout << "CUDA error at "
                  << file << ":" << line << std::endl
                  << "error code: " << (unsigned int) code
                  << " type: \"" << cudaGetErrorString(cudaGetLastError()) << "\"" << std::endl
                  << "func: \"" << func << "\""
                  << std::endl;
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

int main(int argc, char *argv[]) {

    // Number of FFTs to compute.
    const int NUM_DATA = 64;

    // Length of each FFT.
    const int N = 1048576;

    // Number of GPU streams across which to distribute the FFTs.
    const int NUM_STREAMS = 4;

    // Allocate and initialize host input data.
    float2 **h_in = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_in[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_in[ii][jj].x = (float) 1.f;
            h_in[ii][jj].y = (float) 0.f;
        }
    }

    // Allocate and initialize host output data.
    float2 **h_out = new float2 *[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        h_out[ii] = new float2[N];
        for (int jj = 0; jj < N; ++jj) {
            h_out[ii][jj].x = 0.f;
            h_out[ii][jj].y = 0.f;
        }
    }

    // Pin host input and output memory for cudaMemcpyAsync.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostRegister(h_in[ii], N*sizeof(float2), cudaHostRegisterPortable));
        check_cuda_errors(cudaHostRegister(h_out[ii], N*sizeof(float2), cudaHostRegisterPortable));
    }

    // Allocate pointers to device input and output arrays.
    float2 **d_in = new float2 *[NUM_STREAMS];
    float2 **d_out = new float2 *[NUM_STREAMS];

    // Allocate input and output arrays on device.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaMalloc((void**)&d_in[ii], N*sizeof(float2)));
        check_cuda_errors(cudaMalloc((void**)&d_out[ii], N*sizeof(float2)));
    }

    // Create CUDA streams.
    cudaStream_t streams[NUM_STREAMS];
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamCreate(&streams[ii]));
    }

    // Creates cuFFT plans and sets them in streams
    cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        cufftPlan1d(&plans[ii], N, CUFFT_C2C, 1);
        cufftSetStream(plans[ii], streams[ii]);
    }

    // Fill streams with async memcopies and FFTs.
    for (int ii = 0; ii < NUM_DATA; ii++) {
        int jj = ii % NUM_STREAMS;
        check_cuda_errors(cudaMemcpyAsync(d_in[jj], h_in[jj], N*sizeof(float2), cudaMemcpyHostToDevice, streams[jj]));
        cufftExecC2C(plans[jj], (cufftComplex*)d_in[jj], (cufftComplex*)d_out[jj], CUFFT_FORWARD);
        check_cuda_errors(cudaMemcpyAsync(h_out[jj], d_out[jj], N*sizeof(float2), cudaMemcpyDeviceToHost, streams[jj]));
    }

    // Wait for calculations to complete.
    for(int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaStreamSynchronize(streams[ii]));
    }

    // Free memory and streams.
    for (int ii = 0; ii < NUM_STREAMS; ii++) {
        check_cuda_errors(cudaHostUnregister(h_in[ii]));
        check_cuda_errors(cudaHostUnregister(h_out[ii]));
        check_cuda_errors(cudaFree(d_in[ii]));
        check_cuda_errors(cudaFree(d_out[ii]));
        delete[] h_in[ii];
        delete[] h_out[ii];
        check_cuda_errors(cudaStreamDestroy(streams[ii]));
    }

    // plans was allocated with malloc, so release it with free (not delete).
    free(plans);

    delete[] h_in;
    delete[] h_out;
    delete[] d_in;
    delete[] d_out;

    cudaDeviceReset();  

    return 0;
}
Answered 2016-11-01T17:35:52.483