I have two tasks. Each of them performs a copy to device (D), a kernel run (R), and a copy to host (H) op. I overlap the copy to device of task2 (D2) with the kernel run of task1 (R1), and I overlap the kernel run of task2 (R2) with the copy to host of task1 (H1).
I also record the start and stop times of the D, R and H ops of each task using cudaEventRecord.
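Concretely, each op is bracketed by an event pair recorded in that op's own stream; a minimal sketch of the pattern (evStartOp, evStopOp, msOp and someAsyncOp are placeholder names here, the full code is attached at the bottom):

    cudaEventRecord(evStartOp, stream);                 // start marker, enqueued in the op's stream
    someAsyncOp(..., stream);                           // the D, R or H operation
    cudaEventRecord(evStopOp, stream);                  // stop marker, enqueued in the same stream
    ...
    cudaDeviceSynchronize();                            // make sure both events have completed
    cudaEventElapsedTime(&msOp, evStartOp, evStopOp);   // elapsed time in ms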
I have a GeForce GT 555M, CUDA 4.1 and Fedora 16.
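To rule out a hardware limitation, the device's copy/kernel-overlap capabilities can be queried up front; a minimal check of this kind (separate from the attached code) might look like:

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // device 0 assumed
    printf("deviceOverlap=%d asyncEngineCount=%d concurrentKernels=%d\n",
           prop.deviceOverlap, prop.asyncEngineCount, prop.concurrentKernels);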
I have three scenarios:
Scenario1: I use one stream per task. I place the start/stop events right before/after each op.
Scenario2: I use one stream per task. I place the start event of the second op of each overlapping pair before the start of the first one (i.e., start R1 goes before start D2, and start H1 goes before start R2).
Scenario3: I use two streams per task and synchronize them with cudaStreamWaitEvent. One stream is used for the D and H (copy) ops, the other one for the R op. I place the start/stop events right before/after each op.
Scenario1 fails to overlap the ops (neither D2-R1 nor R2-H1 can be overlapped), whereas Scenario2 and Scenario3 succeed. My question is: why does Scenario1 fail while the others succeed?
For each scenario, I measure the overall time to perform Task1 and Task2. R1 and R2 each take 5 ms to run. Since Scenario1 cannot overlap the ops, its overall time is 10 ms longer than that of Scenarios 2 and 3.
Here is the pseudocode for the scenarios:
Scenario1 (fails): task1 uses stream1, task2 uses stream2
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start D2 on stream2
D2 on stream2
stop D2 on stream2
start R1 on stream1
R1 on stream1
stop R1 on stream1
start R2 on stream2
R2 on stream2
stop R2 on stream2
start H1 on stream1
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
Scenario2 (succeeds): task1 uses stream1, task2 uses stream2; the start event of the second op of each overlapping pair is moved up.
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start R1 on stream1 //moved-up
start D2 on stream2
D2 on stream2
stop D2 on stream2
R1 on stream1
stop R1 on stream1
start H1 on stream1 //moved-up
start R2 on stream2
R2 on stream2
stop R2 on stream2
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
Scenario3 (succeeds): task1 uses stream1 and stream3, task2 uses stream2 and stream4
start overall
start D1 on stream1
D1 on stream1
stop D1 on stream1
start D2 on stream2
D2 on stream2
stop D2 on stream2
start R1 on stream3
R1 on stream3
stop R1 on stream3
start R2 on stream4
R2 on stream4
stop R2 on stream4
start H1 on stream1
H1 on stream1
stop H1 on stream1
start H2 on stream2
H2 on stream2
stop H2 on stream2
stop overall
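In Scenario3, the cross-stream ordering above is enforced with cudaStreamWaitEvent in the attached code; for example, R1 on stream3 is only allowed to start once D1's stop event on stream1 has completed:

    cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);   // stream3 waits for D1 (stream1) to finish
    cudaEventRecord(ceEvStartKer1, csStream3);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream3>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream3);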
Here is the overall timing information for all the scenarios (in ms, from cudaEventElapsedTime): Scenario1 = 39.390240, Scenario2 = 29.190241, Scenario3 = 29.298208
The CUDA code is attached below:
#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
    {
        // redundant writes just to give the kernel some measurable work
        C[i] = A[i] + B[N-1-i];   // B[N-1-i] stays within bounds for i in [0, N)
        C[i] = A[i] + B[i] * 2;
        C[i] = A[i] + B[i] * 3;
        C[i] = A[i] + B[i] * 4;
        C[i] = A[i] + B[i];
    }
}
void overlap()
{
    float* h_A;
    float *d_A, *d_C;
    float* h_A2;
    float *d_A2, *d_C2;
    int N = 10000000;
    size_t size = N * sizeof(float);

    // Allocate pinned host memory (required for truly asynchronous copies)
    cudaMallocHost((void**) &h_A, size);
    cudaMallocHost((void**) &h_A2, size);

    // Allocate vectors in device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_C, size);
    cudaMalloc((void**)&d_A2, size);
    cudaMalloc((void**)&d_C2, size);
    float fTimCpyDev1, fTimKer1, fTimCpyHst1, fTimCpyDev2, fTimKer2, fTimCpyHst2;   // per-op timings (not printed in this snippet)
    float fTimOverall3, fTimOverall1, fTimOverall2;

    for (int i = 0; i < N; ++i)
    {
        h_A[i] = 1;
        h_A2[i] = 5;
    }

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    cudaStream_t csStream1, csStream2, csStream3, csStream4;
    cudaStreamCreate(&csStream1);
    cudaStreamCreate(&csStream2);
    cudaStreamCreate(&csStream3);
    cudaStreamCreate(&csStream4);

    // Events for the overall measurement
    cudaEvent_t ceEvStart, ceEvStop;
    cudaEventCreate( &ceEvStart );
    cudaEventCreate( &ceEvStop );

    // Events for the D, R and H ops of task1
    cudaEvent_t ceEvStartCpyDev1, ceEvStopCpyDev1, ceEvStartKer1, ceEvStopKer1, ceEvStartCpyHst1, ceEvStopCpyHst1;
    cudaEventCreate( &ceEvStartCpyDev1 );
    cudaEventCreate( &ceEvStopCpyDev1 );
    cudaEventCreate( &ceEvStartKer1 );
    cudaEventCreate( &ceEvStopKer1 );
    cudaEventCreate( &ceEvStartCpyHst1 );
    cudaEventCreate( &ceEvStopCpyHst1 );

    // Events for the D, R and H ops of task2
    cudaEvent_t ceEvStartCpyDev2, ceEvStopCpyDev2, ceEvStartKer2, ceEvStopKer2, ceEvStartCpyHst2, ceEvStopCpyHst2;
    cudaEventCreate( &ceEvStartCpyDev2 );
    cudaEventCreate( &ceEvStopCpyDev2 );
    cudaEventCreate( &ceEvStartKer2 );
    cudaEventCreate( &ceEvStopKer2 );
    cudaEventCreate( &ceEvStartCpyHst2 );
    cudaEventCreate( &ceEvStopCpyHst2 );
    // Scenario1: one stream per task, events placed right before/after each op
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);

    // D1 and D2
    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);

    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);

    // R1 and R2
    cudaEventRecord(ceEvStartKer1, csStream1);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream1>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream1);

    cudaEventRecord(ceEvStartKer2, csStream2);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream2>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream2);

    // H1 and H2
    cudaEventRecord(ceEvStartCpyHst1, csStream1);
    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);

    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);

    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall1, ceEvStart, ceEvStop);
    printf("Scenario1 overall time= %10f\n", fTimOverall1);
    // Scenario2: one stream per task, start events of the second overlapping ops moved up
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);

    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);

    cudaEventRecord(ceEvStartKer1, csStream1);   // moved up: recorded before D2 is issued
    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);

    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream1>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream1);

    cudaEventRecord(ceEvStartCpyHst1, csStream1);   // moved up: recorded before R2 is issued
    cudaEventRecord(ceEvStartKer2, csStream2);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream2>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream2);

    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);

    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);

    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall2, ceEvStart, ceEvStop);
    printf("Scenario2 overall time= %10f\n", fTimOverall2);
    // Scenario3: two streams per task (copies vs. kernel), cudaStreamWaitEvent for cross-stream sync
    cudaDeviceSynchronize();
    cudaEventRecord(ceEvStart, 0);

    cudaEventRecord(ceEvStartCpyDev1, csStream1);
    cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, csStream1);
    cudaEventRecord(ceEvStopCpyDev1, csStream1);

    cudaEventRecord(ceEvStartCpyDev2, csStream2);
    cudaMemcpyAsync(d_A2, h_A2, size, cudaMemcpyHostToDevice, csStream2);
    cudaEventRecord(ceEvStopCpyDev2, csStream2);

    // R1 on stream3 waits for D1 on stream1 to finish
    cudaStreamWaitEvent(csStream3, ceEvStopCpyDev1, 0);
    cudaEventRecord(ceEvStartKer1, csStream3);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream3>>>(d_A, d_A, d_C, N);
    cudaEventRecord(ceEvStopKer1, csStream3);

    // R2 on stream4 waits for D2 on stream2 to finish
    cudaStreamWaitEvent(csStream4, ceEvStopCpyDev2, 0);
    cudaEventRecord(ceEvStartKer2, csStream4);
    VecAdd<<<blocksPerGrid, threadsPerBlock, 0, csStream4>>>(d_A2, d_A2, d_C2, N);
    cudaEventRecord(ceEvStopKer2, csStream4);

    // H1 on stream1 waits for R1 on stream3 to finish
    cudaStreamWaitEvent(csStream1, ceEvStopKer1, 0);
    cudaEventRecord(ceEvStartCpyHst1, csStream1);
    cudaMemcpyAsync(h_A, d_C, size, cudaMemcpyDeviceToHost, csStream1);
    cudaEventRecord(ceEvStopCpyHst1, csStream1);

    // H2 on stream2 waits for R2 on stream4 to finish
    cudaStreamWaitEvent(csStream2, ceEvStopKer2, 0);
    cudaEventRecord(ceEvStartCpyHst2, csStream2);
    cudaMemcpyAsync(h_A2, d_C2, size, cudaMemcpyDeviceToHost, csStream2);
    cudaEventRecord(ceEvStopCpyHst2, csStream2);

    cudaEventRecord(ceEvStop, 0);
    cudaDeviceSynchronize();
    cudaEventElapsedTime( &fTimOverall3, ceEvStart, ceEvStop);
    printf("Scenario3 overall time = %10f\n", fTimOverall3);
    cudaStreamDestroy(csStream1);
    cudaStreamDestroy(csStream2);
    cudaStreamDestroy(csStream3);
    cudaStreamDestroy(csStream4);

    cudaFree(d_A);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFree(d_A2);
    cudaFree(d_C2);
    cudaFreeHost(h_A2);
}
int main()
{
    overlap();
    return 0;
}
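The per-op times quoted above, e.g. the 5 ms for R1 and R2, come from the recorded event pairs; a minimal read-back sketch, using the fTimKer1/fTimKer2 variables already declared in overlap():

    // after the cudaDeviceSynchronize() of a scenario
    cudaEventElapsedTime(&fTimKer1, ceEvStartKer1, ceEvStopKer1);
    cudaEventElapsedTime(&fTimKer2, ceEvStartKer2, ceEvStopKer2);
    printf("R1 = %f ms, R2 = %f ms\n", fTimKer1, fTimKer2);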
Thanks a lot in advance for your time!