回答您的问题“关于我可以做些什么来加速 GPU 代码的任何想法?”
首先,让我先声明一下,所提议的操作X = alpha * Y + beta * Z
对于所需的每字节数据传输没有大量的计算强度。结果,我无法在这个特定代码上击败 CPU 时间。但是,涵盖 2 个想法以加快此代码的速度可能是有益的:
使用页面锁定内存进行数据传输操作。这使 GPU 版本的数据传输时间减少了约 2 倍,而这在 GPU 版本的整体执行时间中占主导地位。
使用@njuffa在这里提出的cudaMemcpy2D跨步复制技术。结果是 2 倍:我们可以将数据传输量减少到仅计算所需的量,然后我们可以按照评论中的建议重新编写内核以连续操作数据(再次由 njuffa 提供) . 这使数据传输时间进一步提高了 3 倍,内核计算时间提高了约 10 倍。
此代码提供了这些操作的示例:
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 1024
#define DSIZE 5000000
#define WSIZE 50000
#define XSTEP 47
#define YSTEP 43
#define ZSTEP 41
#define TOL 0.00001f
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef float real;
__global__ void vectorStepAddKernel2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
{
x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
}
}
__global__ void vectorStepAddKernel2i(real *x, real *y, real *z, real alpha, real beta, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
{
x[i] = alpha* y[i] + beta*z[i];
}
}
void vectorStepAdd2(real *x, real *y, real *z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
int threadsPerBlock = THREADS_PER_BLOCK;
int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
vectorStepAddKernel2<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size, xstep, ystep, zstep);
cudaDeviceSynchronize();
cudaCheckErrors("kernel2 fail");
}
void vectorStepAdd2i(real *x, real *y, real *z, real alpha, real beta, int size)
{
int threadsPerBlock = THREADS_PER_BLOCK;
int blocksPerGrid = (size + threadsPerBlock -1)/threadsPerBlock;
vectorStepAddKernel2i<<<blocksPerGrid, threadsPerBlock>>>(x, y, z, alpha, beta, size);
cudaDeviceSynchronize();
cudaCheckErrors("kernel3 fail");
}
//CPU function:
void vectorStepAdd3(real *x, real*y, real* z, real alpha, real beta, int size, int xstep, int ystep, int zstep)
{
for(int i=0;i<size;i++)
{
x[i*xstep] = alpha* y[i*ystep] + beta*z[i*zstep];
}
}
int main() {
real *h_x, *h_y, *h_z, *c_x, *h_x1;
real *d_x, *d_y, *d_z, *d_x1, *d_y1, *d_z1;
int dsize = DSIZE;
int wsize = WSIZE;
int xstep = XSTEP;
int ystep = YSTEP;
int zstep = ZSTEP;
real alpha = 0.5f;
real beta = 0.5f;
float et;
/*
h_x = (real *)malloc(dsize*sizeof(real));
if (h_x == 0){printf("malloc1 fail\n"); return 1;}
h_y = (real *)malloc(dsize*sizeof(real));
if (h_y == 0){printf("malloc2 fail\n"); return 1;}
h_z = (real *)malloc(dsize*sizeof(real));
if (h_z == 0){printf("malloc3 fail\n"); return 1;}
c_x = (real *)malloc(dsize*sizeof(real));
if (c_x == 0){printf("malloc4 fail\n"); return 1;}
h_x1 = (real *)malloc(dsize*sizeof(real));
if (h_x1 == 0){printf("malloc1 fail\n"); return 1;}
*/
cudaHostAlloc((void **)&h_x, dsize*sizeof(real), cudaHostAllocDefault);
cudaCheckErrors("cuda Host Alloc 1 fail");
cudaHostAlloc((void **)&h_y, dsize*sizeof(real), cudaHostAllocDefault);
cudaCheckErrors("cuda Host Alloc 2 fail");
cudaHostAlloc((void **)&h_z, dsize*sizeof(real), cudaHostAllocDefault);
cudaCheckErrors("cuda Host Alloc 3 fail");
cudaHostAlloc((void **)&c_x, dsize*sizeof(real), cudaHostAllocDefault);
cudaCheckErrors("cuda Host Alloc 4 fail");
cudaHostAlloc((void **)&h_x1, dsize*sizeof(real), cudaHostAllocDefault);
cudaCheckErrors("cuda Host Alloc 5 fail");
cudaMalloc((void **)&d_x, dsize*sizeof(real));
cudaCheckErrors("cuda malloc1 fail");
cudaMalloc((void **)&d_y, dsize*sizeof(real));
cudaCheckErrors("cuda malloc2 fail");
cudaMalloc((void **)&d_z, dsize*sizeof(real));
cudaCheckErrors("cuda malloc3 fail");
cudaMalloc((void **)&d_x1, wsize*sizeof(real));
cudaCheckErrors("cuda malloc4 fail");
cudaMalloc((void **)&d_y1, wsize*sizeof(real));
cudaCheckErrors("cuda malloc5 fail");
cudaMalloc((void **)&d_z1, wsize*sizeof(real));
cudaCheckErrors("cuda malloc6 fail");
for (int i=0; i< dsize; i++){
h_x[i] = 0.0f;
h_x1[i] = 0.0f;
c_x[i] = 0.0f;
h_y[i] = (real)(rand()/(real)RAND_MAX);
h_z[i] = (real)(rand()/(real)RAND_MAX);
}
cudaEvent_t t_start, t_stop, k_start, k_stop;
cudaEventCreate(&t_start);
cudaEventCreate(&t_stop);
cudaEventCreate(&k_start);
cudaEventCreate(&k_stop);
cudaCheckErrors("event fail");
// first test original GPU version
cudaEventRecord(t_start);
cudaMemcpy(d_x, h_x, dsize * sizeof(real), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
cudaMemcpy(d_y, h_y, dsize * sizeof(real), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 2 fail");
cudaMemcpy(d_z, h_z, dsize * sizeof(real), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 3 fail");
cudaEventRecord(k_start);
vectorStepAdd2(d_x, d_y, d_z, alpha, beta, wsize, xstep, ystep, zstep);
cudaEventRecord(k_stop);
cudaMemcpy(h_x, d_x, dsize * sizeof(real), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 4 fail");
cudaEventRecord(t_stop);
cudaEventSynchronize(t_stop);
cudaEventElapsedTime(&et, t_start, t_stop);
printf("GPU original version total elapsed time is: %f ms.\n", et);
cudaEventElapsedTime(&et, k_start, k_stop);
printf("GPU original kernel elapsed time is: %f ms.\n", et);
//now test CPU version
cudaEventRecord(t_start);
vectorStepAdd3(c_x, h_y, h_z, alpha, beta, wsize, xstep, ystep, zstep);
cudaEventRecord(t_stop);
cudaEventSynchronize(t_stop);
cudaEventElapsedTime(&et, t_start, t_stop);
printf("CPU version total elapsed time is: %f ms.\n", et);
for (int i = 0; i< dsize; i++)
if (fabsf((float)(h_x[i]-c_x[i])) > TOL) {
printf("cpu/gpu results mismatch at i = %d, cpu = %f, gpu = %f\n", i, c_x[i], h_x[i]);
return 1;
}
// now test improved GPU version
cudaEventRecord(t_start);
// cudaMemcpy2D(d_x1, sizeof(real), h_x, xstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
// cudaCheckErrors("cuda memcpy 5 fail");
cudaMemcpy2D(d_y1, sizeof(real), h_y, ystep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 6 fail");
cudaMemcpy2D(d_z1, sizeof(real), h_z, zstep * sizeof(real), sizeof(real), wsize, cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 7 fail");
cudaEventRecord(k_start);
vectorStepAdd2i(d_x1, d_y1, d_z1, alpha, beta, wsize);
cudaEventRecord(k_stop);
cudaMemcpy2D(h_x1, xstep*sizeof(real), d_x1, sizeof(real), sizeof(real), wsize, cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 8 fail");
cudaEventRecord(t_stop);
cudaEventSynchronize(t_stop);
cudaEventElapsedTime(&et, t_start, t_stop);
printf("GPU improved version total elapsed time is: %f ms.\n", et);
cudaEventElapsedTime(&et, k_start, k_stop);
printf("GPU improved kernel elapsed time is: %f ms.\n", et);
for (int i = 0; i< dsize; i++)
if (fabsf((float)(h_x[i]-h_x1[i])) > TOL) {
printf("gpu/gpu improved results mismatch at i = %d, gpu = %f, gpu imp = %f\n", i, h_x[i], h_x1[i]);
return 1;
}
printf("Results:i CPU GPU GPUi \n");
for (int i = 0; i< 20*xstep; i+=xstep)
printf(" %d %f %f %f %f %f\n",i, c_x[i], h_x[i], h_x1[i]);
return 0;
}
如前所述,我仍然无法超越 CPU 时间,我将此归因于我自己缺乏编码技能,或者此操作基本上没有足够的计算复杂性在 GPU 上引起兴趣。不过这里有一些示例结果:
GPU original version total elapsed time is: 13.352256 ms.
GPU original kernel elapsed time is: 0.195808 ms.
CPU version total elapsed time is: 2.599584 ms.
GPU improved version total elapsed time is: 4.228288 ms.
GPU improved kernel elapsed time is: 0.027392 ms.
Results:i CPU GPU GPUi
0 0.617285 0.617285 0.617285
47 0.554522 0.554522 0.554522
94 0.104245 0.104245 0.104245
....
我们可以看到改进后的内核与原始内核相比整体减少了大约 3 倍,这几乎都是由于数据复制时间的减少。数据复制时间的减少是由于使用改进的 2D memcpy,我们只需要复制我们实际使用的数据。(如果没有页面锁定的内存,这些数据传输时间大约会增加一倍)。我们还可以看到,内核计算时间比原始内核的 CPU 计算快约 10 倍,比改进内核的 CPU 计算快约 100 倍。然而,考虑到数据传输时间,我们无法克服 CPU 速度。
最后一条评论是 cudaMemcpy2D 操作的“成本”仍然很高。向量大小减少 100 倍,我们只看到复制时间减少了 3 倍。因此,跨步访问仍然是使用 GPU 的一种相对昂贵的方式。如果我们只是传输 50,000 个连续元素的向量,我们预计复制时间几乎线性减少 100 倍(与 5000000 个元素的原始复制向量相比)。这意味着复制的时间将少于 1 毫秒,并且我们的 GPU 版本将比 CPU 更快,至少在这种幼稚的单线程 CPU 代码中是这样。