我相信下面的代码执行一个典型的
- 复制到设备
- 调用内核
- 复制回主机
工作流程。
我发现非常奇怪的是,当我使用 NSight Profiler 的 Trace Application 选项时,在报告中,打开“堆栈跟踪”时,我发现最昂贵的操作是粗体线,而只是那条线,而其他 memoCopy 操作的成本几乎仅为该 memoCopy 操作的 10% 或更少。
这是因为它是调用内核之后的第一行,因此探查器以某种方式将某些同步的成本包含在此特定 memoCopy 操作的成本中?
对于像我正在处理的问题,需要非常频繁的同步并将结果“返回”给主机,有人可以就最佳实践提供一些一般性建议吗?我特别考虑了两个选项,我不确定最终是否会有所帮助
- 使用“零拷贝”内存,(示例 11.2 中的 CUDA)
- 使用原子操作创建我的同步方式
{
int numP = p_psPtr->P.size();
int numL = p_psPtr->L.size();
// Out partition is in Unit of the Number of Particles
int block_dim = BLOCK_DIM_X;
int grid_dim = numP/block_dim + (numP%block_dim == 0 ? 0:1);
vector<Particle> pVec(p_psPtr->P.begin(), p_psPtr->P.end());
Particle *d_part_arr = 0;
Particle *part_arr = pVec.data();
HANDLE_ERROR(cudaMalloc((void**)&d_part_arr, numP * sizeof(Particle)));
HANDLE_ERROR(cudaMemcpy(d_part_arr, part_arr, numP * sizeof(Particle), cudaMemcpyHostToDevice));
vector<SpringLink> lVec(p_psPtr->L.begin(), p_psPtr->L.end());
SpringLink *d_link_arr = 0;
SpringLink *link_arr = lVec.data();
HANDLE_ERROR(cudaMalloc((void**)&d_link_arr, numL * sizeof(SpringLink)));
HANDLE_ERROR(cudaMemcpy(d_link_arr, link_arr, numL * sizeof(SpringLink), cudaMemcpyHostToDevice));
Point3D *d_oriPos_arr = 0;
Point3D *oriPos_arr = p_originalPos.data();
HANDLE_ERROR(cudaMalloc((void**)&d_oriPos_arr, numP * sizeof(Point3D)));
HANDLE_ERROR(cudaMemcpy(d_oriPos_arr, oriPos_arr, numP * sizeof(Point3D), cudaMemcpyHostToDevice));
Vector3D *d_oriVel_arr = 0;
Vector3D *oriVel_arr = p_originalVel.data();
HANDLE_ERROR(cudaMalloc((void**)&d_oriVel_arr, numP * sizeof(Vector3D)));
HANDLE_ERROR(cudaMemcpy(d_oriVel_arr, oriVel_arr, numP * sizeof(Vector3D), cudaMemcpyHostToDevice));
Point3D *d_updPos_arr = 0;
Point3D *updPos_arr = p_updatedPos.data();
HANDLE_ERROR(cudaMalloc((void**)&d_updPos_arr, numP * sizeof(Point3D)));
HANDLE_ERROR(cudaMemcpy(d_updPos_arr, updPos_arr, numP * sizeof(Point3D), cudaMemcpyHostToDevice));
Vector3D *d_updVel_arr = 0;
Vector3D *updVel_arr = p_updatedVel.data();
HANDLE_ERROR(cudaMalloc((void**)&d_updVel_arr, numP * sizeof(Vector3D)));
HANDLE_ERROR(cudaMemcpy(d_updVel_arr, updVel_arr, numP * sizeof(Vector3D), cudaMemcpyHostToDevice));
int *d_converged_arr = 0;
int *converged_arr = &p_converged[0];
HANDLE_ERROR(cudaMalloc((void**)&d_converged_arr, numP * sizeof(int)));
HANDLE_ERROR(cudaMemcpy(d_converged_arr, converged_arr, numP * sizeof(int), cudaMemcpyHostToDevice));
// Run the function on the device
handleParticleKernel<<<grid_dim, block_dim>>>(d_part_arr, d_link_arr, numP,
d_oriPos_arr, d_oriVel_arr, d_updPos_arr, d_updVel_arr,
d_converged_arr, p_innerLoopIdx, p_dt);
**HANDLE_ERROR(cudaMemcpy(oriPos_arr, d_oriPos_arr, numP * sizeof(Point3D), cudaMemcpyDeviceToHost));**
HANDLE_ERROR(cudaMemcpy(oriVel_arr, d_oriVel_arr, numP * sizeof(Vector3D), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(updPos_arr, d_updPos_arr, numP * sizeof(Point3D), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(updVel_arr, d_updVel_arr, numP * sizeof(Vector3D), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(converged_arr, d_converged_arr, numP * sizeof(int), cudaMemcpyDeviceToHost));
}