cuda - Cuda 统一内存与 cudaMalloc

Question

我正在尝试做一些基准测试以确保使用 CUDA 的统一内存（UM）方法不会损害我们的性能。

我正在执行 FFT。我使用 UM 的一种方式，我使用 cudaMalloc 的一种方式

之后我比较了结果，它们都匹配（这很好）。

但是，我为 UM 方法获得的时间是 ~.5ms 与 ~.04 的 cudaMalloc 方式相比（在多次运行平均之后）

我正在使用事件记录来进行计时。我在 cufftExecC2C 调用之前和之后都有一个。

此外，我添加了另外两个事件记录来测量任何内存传输到设备之前的时间，以及在我从设备取回数据后使用数据之后的时间。

这样做时，我看到 UM 方法需要约 1.6 毫秒，而 cudaMalloc 方法需要约 0.7 毫秒。

下面是执行 UM 方法的代码片段：

cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));

cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();

cudaEventRecord(start_kernel);

cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

以下是针对 cudaMalloc 方法的

cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
                                        cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);

cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
                cudaMemcpyDefault);
cudaEventRecord(stop_after_memDtoH);

float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um += std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";

cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

使用统一内存方法加快速度时，我还能做些什么吗？我预计 UM 会慢一些，但不会这么慢。

我们在 Redhat 7.3 上使用 P100 和 Cuda 9

score 2 · Accepted Answer

您发布的代码的一个问题是您没有对 FFTcudaMemPrefetchAsync的输出数据进行操作。根据我的测试，这有很大的不同。您的代码还有一些其他问题，例如我们没有调用cudaFree分配给malloc.

这是围绕您展示的内容构建的完整代码。当我在 CentOS7.4、CUDA 9.1、Tesla P100 上运行它时，我得到了在托管内存情况下执行的 FFT（3.52ms）与在非托管内存情况下执行的 FFT（3.45ms）的可比时间：

$ cat t43.cu
#include <cufft.h>
#include <iostream>
#include <string>

//using namespace std;
const int dataSize  = 1048576*32;
void setupWave(const int ds, cufftComplex *d){
  for (int i = 0; i < ds; i++){
    d[i].x = 1.0f;
    d[i].y = 0.0f;}
}
int main(){

cufftComplex *inData, *outData;

cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));

cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 0);
cudaMemPrefetchAsync(outData, dataSize * sizeof(cufftComplex), 0);
cudaDeviceSynchronize();

cudaEventRecord(start_kernel);

cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

float sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for UM is " << sum << std::endl;

float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

std::string resultString_um = std::to_string(dataSize) + " samples took " + std::to_string(umTime) + "ms,  Overall: " + std::to_string(overallUmTime) + "\n";

std::cout << resultString_um;
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaFree(inData);
cudaFree(outData);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);



cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
//cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);

//cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
//                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);

setupWave(dataSize, inData);

cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
                                        cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);

cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);

cudaEventRecord(stop_kernel);

cudaEventSynchronize(stop_kernel);

cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
                cudaMemcpyDefault);

 sum = 0;
for (int i = 0; i < dataSize; i++) {
        sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);

std::cout << "sum for non-UM is " << sum << std::endl;

//float umTime = 0;
//float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
                stop_after_memDtoH);

resultString_um = std::to_string(dataSize) + " samples took "
                + std::to_string(umTime) + "ms,  Overall: "
                + std::to_string(overallUmTime) + "\n";
std::cout << resultString_um;
free(outData);
free(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);

cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);

cufftDestroy(plan);

}
$ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
$ ./t43
sum for UM is 3.35544e+07
33554432 samples took 3.520640ms,  Overall: 221.909988
sum for non-UM is 3.35544e+07
33554432 samples took 3.456160ms,  Overall: 278.099426
$

cuda - Cuda 统一内存与 cudaMalloc

1 回答 1

Related

Reference