正如康世印已经指出的那样,使用 cudaMallocPitch 带来的改进
取决于计算能力,并且预计对于较旧的计算能力更为显著。然而,对于较新的计算能力,间距(pitched)内存分配似乎并没有带来明显的加速。
下面的代码提供了一个比较非间距内存与间距内存性能的测试平台。具体而言,该代码对三个(非间距或间距)矩阵执行求和。之所以处理三个矩阵,是为了让内存事务相对于计算量更为突出,从而凸显非间距分配与间距分配之间的差异。下面是一张 GTX 960
卡和一张 GT 920M
卡的计时结果。
GTX 960
Non-pitched - Time = 3.242208; Memory = 65320000 bytes
Pitched - Time = 3.150944; Memory = 65433600 bytes
GT 920M
Non-pitched - Time = 20.496799; Memory = 65320000 bytes
Pitched - Time = 20.418560; Memory = 65433600 bytes
可以看出,两张卡的两种实现方式并没有太大区别。上述结果还表明,由于使用了间距内存分配,内存占用率有所增加。
这是代码:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
/******************/
/* TEST KERNEL 2D */
/******************/
// In-place elementwise sum of three Nrows x Ncols row-major matrices: A += B + C.
// Expects a 2D launch whose grid covers ceil(Ncols / blockDim.x) x
// ceil(Nrows / blockDim.y) blocks; surplus threads exit via the bounds guard.
// B and C are read-only: declaring them const __restrict__ lets the compiler
// route their loads through the read-only data cache.
__global__ void test_kernel_2D(float * __restrict__ devPtrA, const float * __restrict__ devPtrB, const float * __restrict__ devPtrC, const int Nrows, const int Ncols)
{
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;   // row index

    // Guard: the grid may overshoot the matrix extent at the tail.
    if ((tidx < Ncols) && (tidy < Nrows)) {
        const int idx = tidy * Ncols + tidx;            // flat row-major offset
        devPtrA[idx] = devPtrA[idx] + devPtrB[idx] + devPtrC[idx];
    }
}
/**************************/
/* TEST KERNEL PITCHED 2D */
/**************************/
// Pitched variant of test_kernel_2D: A += B + C over three Nrows x Ncols
// matrices allocated with cudaMallocPitch. Each pitch argument is the row
// stride IN BYTES returned by cudaMallocPitch, so row addressing goes
// through a char* before reinterpreting as float*.
// Expects the same 2D launch layout as test_kernel_2D.
// B and C are read-only, hence const __restrict__ (read-only data cache).
__global__ void test_kernel_Pitched_2D(float * __restrict__ devPtrA, const float * __restrict__ devPtrB, const float * __restrict__ devPtrC, const size_t pitchA, const size_t pitchB, const size_t pitchC, const int Nrows, const int Ncols)
{
    int tidx = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int tidy = blockIdx.y * blockDim.y + threadIdx.y;   // row index

    if ((tidx < Ncols) && (tidy < Nrows))
    {
        // Pitches are byte counts, so step rows via char* arithmetic.
        float       *row_a = (float *)((char *)devPtrA + tidy * pitchA);
        const float *row_b = (const float *)((const char *)devPtrB + tidy * pitchB);
        const float *row_c = (const float *)((const char *)devPtrC + tidy * pitchC);
        row_a[tidx] = row_a[tidx] + row_b[tidx] + row_c[tidx];
    }
}
/********/
/* MAIN */
/********/
// Benchmark harness: times the non-pitched kernel against the pitched kernel
// on the same Nrows x Ncols workload and reports time plus device-memory
// footprint for each. Relies on gpuErrchk/iDivUp (Utilities.cuh) and
// TimingGPU (TimingGPU.cuh).
int main()
{
    const int Nrows = 7100;
    const int Ncols = 2300;

    TimingGPU timerGPU;

    // One shared byte count; computed in size_t to avoid int overflow.
    const size_t numBytes = (size_t)Nrows * Ncols * sizeof(float);

    // --- Host allocations (checked: malloc may fail for ~65 MB x 3)
    float *hostPtrA = (float *)malloc(numBytes);
    float *hostPtrB = (float *)malloc(numBytes);
    float *hostPtrC = (float *)malloc(numBytes);
    if (!hostPtrA || !hostPtrB || !hostPtrC) {
        fprintf(stderr, "Host allocation failed\n");
        free(hostPtrA); free(hostPtrB); free(hostPtrC);
        return 1;
    }

    float *devPtrA, *devPtrPitchedA;
    float *devPtrB, *devPtrPitchedB;
    float *devPtrC, *devPtrPitchedC;
    size_t pitchA, pitchB, pitchC;

    // --- Host-side initialization: constant fill per matrix.
    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++) {
            hostPtrA[i * Ncols + j] = 1.f;
            hostPtrB[i * Ncols + j] = 2.f;
            hostPtrC[i * Ncols + j] = 3.f;
        }

    // --- 2D non-pitched allocation and host->device memcopy
    gpuErrchk(cudaMalloc(&devPtrA, numBytes));
    gpuErrchk(cudaMalloc(&devPtrB, numBytes));
    gpuErrchk(cudaMalloc(&devPtrC, numBytes));
    gpuErrchk(cudaMemcpy(devPtrA, hostPtrA, numBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devPtrB, hostPtrB, numBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devPtrC, hostPtrC, numBytes, cudaMemcpyHostToDevice));

    // --- 2D pitched allocation and host->device memcopy.
    // cudaMallocPitch takes void**, so cast explicitly.
    gpuErrchk(cudaMallocPitch((void **)&devPtrPitchedA, &pitchA, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMallocPitch((void **)&devPtrPitchedB, &pitchB, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMallocPitch((void **)&devPtrPitchedC, &pitchC, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedA, pitchA, hostPtrA, Ncols * sizeof(float), Ncols * sizeof(float), Nrows, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedB, pitchB, hostPtrB, Ncols * sizeof(float), Ncols * sizeof(float), Nrows, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedC, pitchC, hostPtrC, Ncols * sizeof(float), Ncols * sizeof(float), Nrows, cudaMemcpyHostToDevice));

    dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
    // Fix: block x-dimension must use BLOCKSIZE_x (the original swapped the
    // two; only accidentally harmless because BLOCKSIZE_x == BLOCKSIZE_y).
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y);

    // --- Time the non-pitched kernel.
    timerGPU.StartCounter();
    test_kernel_2D << <gridSize, blockSize >> >(devPtrA, devPtrB, devPtrC, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // %zu for size_t (the original %i is undefined behavior on 64-bit).
    printf("Non-pitched - Time = %f; Memory = %zu bytes \n", timerGPU.GetCounter(), numBytes);

    // --- Time the pitched kernel.
    timerGPU.StartCounter();
    test_kernel_Pitched_2D << <gridSize, blockSize >> >(devPtrPitchedA, devPtrPitchedB, devPtrPitchedC, pitchA, pitchB, pitchC, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Pitched - Time = %f; Memory = %zu bytes \n", timerGPU.GetCounter(), Nrows * pitchA);

    // --- Copy non-pitched results back to the host.
    gpuErrchk(cudaMemcpy(hostPtrA, devPtrA, numBytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hostPtrB, devPtrB, numBytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hostPtrC, devPtrC, numBytes, cudaMemcpyDeviceToHost));

    // --- Cleanup (the original leaked every allocation).
    gpuErrchk(cudaFree(devPtrA));
    gpuErrchk(cudaFree(devPtrB));
    gpuErrchk(cudaFree(devPtrC));
    gpuErrchk(cudaFree(devPtrPitchedA));
    gpuErrchk(cudaFree(devPtrPitchedB));
    gpuErrchk(cudaFree(devPtrPitchedC));
    free(hostPtrA);
    free(hostPtrB);
    free(hostPtrC);

    return 0;
}