Suppose the GPU has one execution engine and one copy engine.
- When a thread in a CUDA kernel accesses host memory, does it keep the copy engine busy? And, as a consequence, does it block all asynchronous memory copies to and from the device issued in other streams?
- When a thread in a CUDA kernel accesses peer device memory, does it keep the copy engines of both devices busy?
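Just to pin down the "one copy engine" assumption from the first bullet: the number of copy engines and whether a card can map pinned host memory can be read from the device properties. Below is a minimal sketch (separate from the test code further down, device 0 assumed) that prints cudaDeviceProp::asyncEngineCount and canMapHostMemory:

#include <cstdio>
#include "cuda_runtime.h"

// Minimal sketch: report how many copy (async) engines device 0 exposes and
// whether it can map pinned host memory into the device address space.
int main()
{
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;

    // asyncEngineCount: 0 = no copy/compute overlap, 1 = one copy engine, 2 = two copy engines
    printf("%s: asyncEngineCount = %d, canMapHostMemory = %d\n",
           prop.name, prop.asyncEngineCount, prop.canMapHostMemory);
    return 0;
}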
I am trying to answer only the first question:

When a thread in a CUDA kernel accesses host memory, does it keep the copy engine busy, and does it therefore block all asynchronous memory copies to and from the device issued in other streams?
I have written the simple code below. It contains two kernels: kernel2, which explicitly uses mapped pinned host memory, and kernel1, which does not. The code uses three streams to check whether the use of mapped pinned host memory breaks concurrency.

Here is the code:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
using namespace std;
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/*******************************/
/* KERNEL FUNCTION - VERSION 1 */
/*******************************/
__global__ void kernel1(const int *in, int *out, int dataSize)
{
    // Grid-stride loop: each thread squares elements spaced by the total number of threads
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end   = dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}
/*******************************/
/* KERNEL FUNCTION - VERSION 2 */
/*******************************/
__global__ void kernel2(const int *in, int *out, int* cnt, int dataSize)
{
    // Same grid-stride loop, but the kernel also reads cnt, a pinned host array, directly
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end   = dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = cnt[i] * in[i] * in[i];
    }
}
/********/
/* MAIN */
/********/
int main()
{
    const int dataSize = 6000000;

    // --- Host side memory allocations
    int *h_in  = new int[dataSize];
    int *h_out = new int[dataSize];

    // --- Host side memory initialization
    for (int i = 0; i < dataSize; i++) h_in[i]  = 5;
    for (int i = 0; i < dataSize; i++) h_out[i] = 0;

    // --- Register host memory as page-locked, as required for cudaMemcpyAsync
    gpuErrchk(cudaHostRegister(h_in,  dataSize * sizeof(int), cudaHostRegisterPortable));
    gpuErrchk(cudaHostRegister(h_out, dataSize * sizeof(int), cudaHostRegisterPortable));

    // --- Device side memory allocations
    int *d_in  = 0; gpuErrchk(cudaMalloc((void**)&d_in,  dataSize * sizeof(int)));
    int *d_out = 0; gpuErrchk(cudaMalloc((void**)&d_out, dataSize * sizeof(int)));

    // --- Testing mapped pinned memory
    int *cnt; gpuErrchk(cudaMallocHost((void**)&cnt, dataSize * sizeof(int)));
    for (int i = 0; i < dataSize; i++) cnt[i] = 2;

    int streamSize = dataSize / NUM_STREAMS;
    size_t streamMemSize = dataSize * sizeof(int) / NUM_STREAMS;

    // --- Setting kernel launch config
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks  = dim3(NUM_BLOCKS,  1, 1);

    // --- Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
    /**********/
    /* CASE 1 */
    /**********/
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
        kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset],                &d_out[offset],                streamSize / 2);
        kernel1<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize / 2);
    }

    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
    /**********/
    /* CASE 2 */
    /**********/
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    for (int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
        kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset],                &d_out[offset],                cnt, streamSize / 2);
        kernel2<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], cnt, streamSize / 2);
    }

    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
    // --- Release resources
    gpuErrchk(cudaHostUnregister(h_in));
    gpuErrchk(cudaHostUnregister(h_out));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
    gpuErrchk(cudaFreeHost(cnt));

    for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));

    delete[] h_in;
    delete[] h_out;

    gpuErrchk(cudaDeviceReset());

    return 0;
}
Judging from the timeline below, the use of mapped pinned host memory in kernel2 does not seem to break concurrency. The code has been tested on a GT540M card, which has a single copy engine.
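As a side note on terminology: cnt above is allocated with cudaMallocHost, which (on a 64-bit setup with unified virtual addressing) is directly accessible from the kernel. A more explicit way to obtain mapped pinned host memory is cudaHostAlloc with the cudaHostAllocMapped flag plus cudaHostGetDevicePointer. A minimal, hypothetical sketch of that variant (not the code that was profiled, and assuming the device reports canMapHostMemory) would look like this:

#include <cstdio>
#include "cuda_runtime.h"

// Sketch: explicitly mapped pinned host memory as an alternative to cudaMallocHost + UVA.
int main()
{
    const int dataSize = 6000000;                        // same size as in the test code

    // Allow pinned host allocations to be mapped into the device address space;
    // must be called before the CUDA context is created.
    cudaSetDeviceFlags(cudaDeviceMapHost);

    int *h_cnt = NULL, *d_cnt = NULL;
    cudaHostAlloc((void**)&h_cnt, dataSize * sizeof(int), cudaHostAllocMapped);
    cudaHostGetDevicePointer((void**)&d_cnt, h_cnt, 0);  // device-side alias of h_cnt

    for (int i = 0; i < dataSize; i++) h_cnt[i] = 2;

    // d_cnt could now be passed to kernel2 in place of cnt.
    printf("host ptr %p -> device ptr %p\n", (void*)h_cnt, (void*)d_cnt);

    cudaFreeHost(h_cnt);
    return 0;
}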