2

我正在尝试做一些基准测试以确保使用 CUDA 的统一内存(UM)方法不会损害我们的性能。

我正在执行 FFT:一种方式使用 UM,另一种方式使用 cudaMalloc。

之后我比较了结果,它们都匹配(这很好)。

但是,UM 方法测得的时间约为 0.5 ms,而 cudaMalloc 方法约为 0.04 ms(均为多次运行后的平均值)。

我正在使用事件记录来进行计时。我在 cufftExecC2C 调用之前和之后都有一个。

此外,我添加了另外两个事件记录来测量任何内存传输到设备之前的时间,以及在我从设备取回数据后使用数据之后的时间。

这样做时,我看到 UM 方法需要约 1.6 毫秒,而 cudaMalloc 方法需要约 0.7 毫秒。

下面是执行 UM 方法的代码片段:

// Unified-memory (UM) variant of the FFT benchmark.
// inData, outData, dataSize, setupWave and resultString_um are declared
// outside this snippet.
cufftHandle plan;
// 1D complex-to-complex plan of length dataSize, batch count 1.
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

// Managed allocations: the same pointers are handed to cuFFT and read by the
// host loop further down.
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));

// start_kernel/stop_kernel bracket only the cufftExecC2C call;
// start_before_memHtoD/stop_after_memDtoH bracket prefetch + FFT + host sum.
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

// Presumably fills inData on the host; setupWave is not shown here.
setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
// Prefetch the input to device id 1.
// NOTE(review): the device id is hard-coded; confirm it is the device the
// plan executes on.
// NOTE(review): outData is not prefetched. Its pages are presumably populated
// on demand while the FFT runs, i.e. inside the start_kernel..stop_kernel
// interval -- TODO confirm with a profiler.
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
// Wait for the prefetch so it is not counted in the kernel interval.
cudaDeviceSynchronize();

cudaEventRecord(start_kernel);

// Out-of-place forward transform: inData -> outData.
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

// Block the host until the FFT has finished before touching outData.
cudaEventSynchronize(stop_kernel);

// Host-side read of the managed output; this loop sits inside the "overall"
// interval because stop_after_memDtoH is recorded after it.
float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

// umTime: FFT only. overallUmTime: prefetch + FFT + host sum.
// cudaEventElapsedTime reports milliseconds.
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

// Release the managed buffers, the events and the plan.
cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

以下是 cudaMalloc 方法的代码:

// Explicit-copy (cudaMalloc) variant of the FFT benchmark.
// inData, outData, dataSize, setupWave and resultString_um are declared
// outside this snippet.
cufftComplex *d_inData;
cufftComplex *d_outData;
// Host staging buffers (plain malloc) and matching device buffers.
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
// 1D complex-to-complex plan of length dataSize, batch count 1.
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

// start_kernel/stop_kernel bracket only the cufftExecC2C call;
// start_before_memHtoD/stop_after_memDtoH bracket both copies, the FFT and
// the host sum.
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

// Presumably fills the host input buffer; setupWave is not shown here.
setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
// Host-to-device copy of the input.
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
                                        cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);

// Out-of-place forward transform on the device buffers.
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

// Block the host until the FFT has finished.
cudaEventSynchronize(stop_kernel);

// Copy the result back to the host; cudaMemcpyDefault lets the runtime infer
// the direction from the pointer values.
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
                cudaMemcpyDefault);
// NOTE(review): stop_after_memDtoH is recorded again after the summation loop
// below; the later record is the one the elapsed-time query observes, so this
// first record has no effect on the reported number.
cudaEventRecord(stop_after_memDtoH);

// Host-side sum of the copied-back output.
float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

// NOTE(review): this is the cudaMalloc path, yet the label below (and the
// umTime / overallUmTime / resultString_um names) still say "UM".
std::cout << "sum for UM is " << sum << std::endl;

// umTime: FFT only. overallUmTime: H2D copy + FFT + D2H copy + host sum.
// cudaEventElapsedTime reports milliseconds.
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

// NOTE(review): outData and inData were obtained from malloc above; they
// should be released with free(), not cudaFree().
cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

还有什么办法可以加快统一内存方法的速度吗?我预计 UM 会慢一些,但不会这么慢。

我们在 Redhat 7.3 上使用 P100 和 Cuda 9

4

1 回答 1

2

您发布的代码的一个问题是您没有对 FFT 的输出数据执行 cudaMemPrefetchAsync。根据我的测试,这会带来很大的差别。您的代码还有一些其他问题,例如不应该对用 malloc 分配的内存调用 cudaFree。

这是围绕您展示的内容构建的完整代码。当我在 CentOS7.4、CUDA 9.1、Tesla P100 上运行它时,我得到了在托管内存情况下执行的 FFT(3.52ms)与在非托管内存情况下执行的 FFT(3.45ms)的可比时间:

$ cat t43.cu
#include <cufft.h>
#include <cstdlib>
#include <iostream>
#include <string>

//using namespace std;
// Transform length in complex samples: 2^25 = 33,554,432.
constexpr int dataSize = 1 << 25;
// Writes the constant complex value (1.0, 0.0) into the first ds entries of d.
// d must point to at least ds writable cufftComplex elements.
void setupWave(const int ds, cufftComplex *d){
  int idx = 0;
  while (idx < ds) {
    cufftComplex &sample = d[idx];
    sample.x = 1.0f;  // real part
    sample.y = 0.0f;  // imaginary part
    ++idx;
  }
}
// Exit with a diagnostic if a CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(err_) << std::endl;                    \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)

// Exit with a diagnostic if a cuFFT call does not return CUFFT_SUCCESS.
#define CUFFT_CHECK(call)                                                    \
  do {                                                                       \
    cufftResult res_ = (call);                                               \
    if (res_ != CUFFT_SUCCESS) {                                             \
      std::cerr << "cuFFT error at " << __FILE__ << ":" << __LINE__          \
                << ": code " << static_cast<int>(res_) << std::endl;         \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)

// The four timing markers shared by both benchmark variants.
struct BenchEvents {
  cudaEvent_t beforeHtoD;   // recorded before any data is moved toward the GPU
  cudaEvent_t kernelStart;  // recorded immediately before cufftExecC2C
  cudaEvent_t kernelStop;   // recorded immediately after cufftExecC2C
  cudaEvent_t afterDtoH;    // recorded after the host has summed the output
};

// Creates all four events of ev.
static void createEvents(BenchEvents &ev) {
  CUDA_CHECK(cudaEventCreate(&ev.kernelStart));
  CUDA_CHECK(cudaEventCreate(&ev.beforeHtoD));
  CUDA_CHECK(cudaEventCreate(&ev.kernelStop));
  CUDA_CHECK(cudaEventCreate(&ev.afterDtoH));
}

// Destroys all four events of ev.
static void destroyEvents(BenchEvents &ev) {
  CUDA_CHECK(cudaEventDestroy(ev.kernelStart));
  CUDA_CHECK(cudaEventDestroy(ev.kernelStop));
  CUDA_CHECK(cudaEventDestroy(ev.beforeHtoD));
  CUDA_CHECK(cudaEventDestroy(ev.afterDtoH));
}

// Host-side checksum: adds the real and imaginary parts of the first n samples
// in index order, accumulating in single precision.
static float sumComponents(const cufftComplex *data, int n) {
  float sum = 0;
  for (int i = 0; i < n; i++) {
    sum += data[i].x + data[i].y;
  }
  return sum;
}

// Prints "<N> samples took <fft>ms,  Overall: <total>" from the recorded
// events. Both durations are in milliseconds.
static void printTiming(const BenchEvents &ev) {
  float fftMs = 0;
  float overallMs = 0;
  CUDA_CHECK(cudaEventElapsedTime(&fftMs, ev.kernelStart, ev.kernelStop));
  CUDA_CHECK(cudaEventElapsedTime(&overallMs, ev.beforeHtoD, ev.afterDtoH));

  std::string resultString = std::to_string(dataSize) + " samples took "
                  + std::to_string(fftMs) + "ms,  Overall: "
                  + std::to_string(overallMs) + "\n";
  std::cout << resultString;
}

// Variant 1: managed (unified) memory. Input and output are both prefetched to
// the current device before the timed FFT, so the kernel interval does not
// include on-demand page migration.
static void runManagedBenchmark() {
  const size_t bytes = dataSize * sizeof(cufftComplex);

  cufftHandle plan;
  CUFFT_CHECK(cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1));

  cufftComplex *inData = nullptr;
  cufftComplex *outData = nullptr;
  CUDA_CHECK(cudaMallocManaged(&inData, bytes));
  CUDA_CHECK(cudaMallocManaged(&outData, bytes));

  BenchEvents ev;
  createEvents(ev);

  setupWave(dataSize, inData);

  // Prefetch to whichever device is current instead of assuming device 0.
  int device = 0;
  CUDA_CHECK(cudaGetDevice(&device));

  CUDA_CHECK(cudaEventRecord(ev.beforeHtoD));
  CUDA_CHECK(cudaMemPrefetchAsync(inData, bytes, device));
  CUDA_CHECK(cudaMemPrefetchAsync(outData, bytes, device));
  // Wait for both prefetches so they are excluded from the kernel interval.
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(cudaEventRecord(ev.kernelStart));
  CUFFT_CHECK(cufftExecC2C(plan, inData, outData, CUFFT_FORWARD));
  CUDA_CHECK(cudaEventRecord(ev.kernelStop));
  // The host must not touch outData until the FFT has completed.
  CUDA_CHECK(cudaEventSynchronize(ev.kernelStop));

  // Reading managed memory on the host is part of the "overall" interval.
  const float sum = sumComponents(outData, dataSize);
  CUDA_CHECK(cudaEventRecord(ev.afterDtoH));
  CUDA_CHECK(cudaEventSynchronize(ev.afterDtoH));

  std::cout << "sum for UM is " << sum << std::endl;
  printTiming(ev);

  destroyEvents(ev);
  CUDA_CHECK(cudaFree(inData));
  CUDA_CHECK(cudaFree(outData));
  CUFFT_CHECK(cufftDestroy(plan));
}

// Variant 2: separate host (malloc) and device (cudaMalloc) buffers with
// explicit blocking copies in both directions.
static void runExplicitCopyBenchmark() {
  const size_t bytes = dataSize * sizeof(cufftComplex);

  cufftComplex *inData = static_cast<cufftComplex *>(std::malloc(bytes));
  cufftComplex *outData = static_cast<cufftComplex *>(std::malloc(bytes));
  if (inData == nullptr || outData == nullptr) {
    std::cerr << "host allocation of " << bytes << " bytes failed" << std::endl;
    std::exit(EXIT_FAILURE);
  }

  cufftComplex *d_inData = nullptr;
  cufftComplex *d_outData = nullptr;
  CUDA_CHECK(cudaMalloc(&d_inData, bytes));
  CUDA_CHECK(cudaMalloc(&d_outData, bytes));

  cufftHandle plan;
  CUFFT_CHECK(cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1));

  BenchEvents ev;
  createEvents(ev);

  setupWave(dataSize, inData);

  CUDA_CHECK(cudaEventRecord(ev.beforeHtoD));
  CUDA_CHECK(cudaMemcpy(d_inData, inData, bytes, cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaEventRecord(ev.kernelStart));
  CUFFT_CHECK(cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD));
  CUDA_CHECK(cudaEventRecord(ev.kernelStop));
  CUDA_CHECK(cudaEventSynchronize(ev.kernelStop));

  CUDA_CHECK(cudaMemcpy(outData, d_outData, bytes, cudaMemcpyDeviceToHost));

  const float sum = sumComponents(outData, dataSize);
  CUDA_CHECK(cudaEventRecord(ev.afterDtoH));
  CUDA_CHECK(cudaEventSynchronize(ev.afterDtoH));

  std::cout << "sum for non-UM is " << sum << std::endl;
  printTiming(ev);

  // Host buffers came from malloc, so they are released with free.
  std::free(outData);
  std::free(inData);
  CUDA_CHECK(cudaFree(d_outData));
  CUDA_CHECK(cudaFree(d_inData));
  destroyEvents(ev);
  CUFFT_CHECK(cufftDestroy(plan));
}

// Runs the managed-memory benchmark followed by the explicit-copy benchmark
// and prints, for each, the output checksum and the FFT-only / overall times.
int main(){
  runManagedBenchmark();
  runExplicitCopyBenchmark();
}
$ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
$ ./t43
sum for UM is 3.35544e+07
33554432 samples took 3.520640ms,  Overall: 221.909988
sum for non-UM is 3.35544e+07
33554432 samples took 3.456160ms,  Overall: 278.099426
$
于 2018-02-05T17:30:26.723 回答