我做了一些关于在4
Kepler K20c GPU 集群上实现并发执行的实验。我已经考虑8
了测试用例,下面报告了其相应的代码以及分析器时间线。
测试用例 #1 - “广度优先”方法 - 同步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
double *inputMatrices = (double *)malloc(N * sizeof(double));
// --- "Breadth-first" approach - no async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpy(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpy(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
可以看出,使用cudaMemcpy
并不能在副本中实现并发,而是在内核执行中实现并发。
测试用例 #2 - “深度优先”方法 - 同步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
double *inputMatrices = (double *)malloc(N * sizeof(double));
// --- "Depth-first" approach - no async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpy(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
gpuErrchk(cudaMemcpy(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
这一次,在内存副本和内核执行中都没有实现并发。
测试用例 #3 - “深度优先”方法 - 带流的异步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
T *d_data;
T *h_data;
cudaStream_t stream;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
gpuErrchk(cudaStreamCreate(&plan.stream));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Depth-first" approach - async
for (int k = 0; k < numGPUs; k++)
{
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice, plan[k].stream));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE, 0, plan[k].stream>>>(plan[k].d_data, NperGPU);
gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost, plan[k].stream));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
正如预期的那样,实现了并发性。
测试用例 #4 - “深度优先”方法 - 默认流中的异步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
T *d_data;
T *h_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Depth-first" approach - no stream
for (int k = 0; k < numGPUs; k++)
{
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
尽管使用默认流,但实现了并发。
测试用例 #5 - “深度优先”方法 - 默认流和唯一宿主cudaMallocHost
ed 向量中的异步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Depth-first" approach - no stream
double *inputMatrices; gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
for (int k = 0; k < numGPUs; k++)
{
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
再次实现并发。
测试用例 #6 - 使用流异步复制的“广度优先”方法
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
T *h_data;
cudaStream_t stream;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
gpuErrchk(cudaStreamCreate(&plan.stream));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice, plan[k].stream));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE, 0, plan[k].stream>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost, plan[k].stream));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
实现了并发,如相应的“深度优先”方法。
测试用例 #7 - “广度优先”方法 - 默认流中的异步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
T *h_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
gpuErrchk(cudaMallocHost((void **)&plan.h_data, NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].h_data, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
实现了并发性,就像在相应的“深度优先”方法中一样。
测试用例 #8 - “广度优先”方法 - 默认流和唯一宿主cudaMallocHost
ed 向量中的异步复制
- 代码 -
#include "Utilities.cuh"
#include "InputOutput.cuh"
#define BLOCKSIZE 128
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < NperGPU) for (int k = 0; k < 1000; k++) d_data[tid] = d_data[tid] * d_data[tid];
}
/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
T *d_data;
};
/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {
// --- Device allocation
gpuErrchk(cudaSetDevice(gpuID));
gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}
/********/
/* MAIN */
/********/
int main() {
const int numGPUs = 4;
const int NperGPU = 500000;
const int N = NperGPU * numGPUs;
plan<double> plan[numGPUs];
for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);
// --- "Breadth-first" approach - async
double *inputMatrices; gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
}
for (int k = 0; k < numGPUs; k++) {
gpuErrchk(cudaSetDevice(k));
gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
}
gpuErrchk(cudaDeviceReset());
}
- 探查器时间表 -
实现了并发性,就像在相应的“深度优先”方法中一样。
结论
使用异步副本可以保证并发执行,无论是使用特意创建的流还是使用默认流。
注意
在上述所有示例中,我已经注意提供足够的工作来完成 GPU,无论是在副本方面还是在计算任务方面。未能为集群提供足够的工作可能会阻止观察并发执行。