c++ - VS2013/Cuda7.0 中的 CUFFT 比 VS2010/Cuda4.2 慢 1000 倍

Question

这个简单的 CUFFT 代码在两个 IDE 上运行 -

VS 2013 与 Cuda 7.0
VS 2010 与 Cuda 4.2

我发现带有 Cuda 7.0 的 VS 20131000大约慢了一倍。平均而言，代码0.6 ms在 VS 2010 中执行，并在 VS 2013 中执行。520 ms

#include "stdafx.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "cufft.h"
typedef cuComplex Complex;
#include <iostream>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    const int SIZE = 10000;
    Complex *h_col = (Complex*)malloc(SIZE*sizeof(Complex));
    for (int i = 0; i < SIZE; i++)
    {
        h_col[i].x = i;
        h_col[i].y = i;
    }
    Complex *d_col;
    cudaMalloc((void**)&d_col, SIZE*sizeof(Complex));
    cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);

    cufftHandle plan;
    const int BATCH = 1;
    cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
    cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);

    cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cufftDestroy(plan);
    cout << milliseconds;

    return 0;
}

代码在同一台计算机上运行，具有相同的操作系统、相同的显卡，并且一个接一个地运行。两种情况下的配置都是 x64 Release。您可以选择是使用 C++ 编译器还是 CUDA C/C++ 编译文件。我在两个项目上都尝试了这两种选择，但没有任何区别。

有什么想法可以解决这个问题吗？

FWIW，我在 VS 2013 上使用 Cuda 6.5 得到与 Cuda 7 相同的结果

score 6 · Accepted Answer

cufft 库从 4.2 到 7.0 变得相当大，这导致初始化时间大大增加。如果你去掉这个初始化时间这个因素，我想你会发现执行时间的差异远小于 1000 倍。

这是一个修改后的代码，演示了这一点：

$ cat t807.cu
#include <cufft.h>
#include <cuComplex.h>
typedef cuComplex Complex;
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    const int SIZE = 10000;
    Complex *h_col = (Complex*)malloc(SIZE*sizeof(Complex));
    for (int i = 0; i < SIZE; i++)
    {
        h_col[i].x = i;
        h_col[i].y = i;
    }
    Complex *d_col;
    cudaMalloc((void**)&d_col, SIZE*sizeof(Complex));
    cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);

    cufftHandle plan;
    const int BATCH = 1;
    cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
    cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);

    cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cufftDestroy(plan);
    cout << milliseconds << endl;

    cudaEventRecord(start);
    for (int i = 0; i < SIZE; i++)
    {
        h_col[i].x = i;
        h_col[i].y = i;
    }
    cudaMemcpy(d_col, h_col, SIZE*sizeof(Complex), cudaMemcpyHostToDevice);

    cufftPlan1d(&plan, SIZE, CUFFT_C2C, BATCH);
    cufftExecC2C(plan, d_col, d_col, CUFFT_FORWARD);

    cudaMemcpy(h_col, d_col, SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cufftDestroy(plan);
    cout << milliseconds << endl;

    return 0;
}
$ nvcc -o t807 t807.cu -lcufft
$ ./t807
94.8298
1.44778
$

上面的第二个数字表示基本上相同的代码，但移除了袖带初始化（因为它是在第一次通过时完成的）。

c++ - VS2013/Cuda7.0 中的 CUFFT 比 VS2010/Cuda4.2 慢 1000 倍

1 回答 1

Related

Reference