
I am implementing an application in CUDA on a GPU with compute capability 1.3. The application involves scanning a 2D array to find the positions at which a smaller 2D array occurs. So far, both arrays have been allocated with cudaMallocPitch() and transferred with cudaMemcpy2D() to satisfy the memory alignment requirements for coalescing.
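For reference, the setup described above might look roughly like the following (a minimal sketch with placeholder sizes; the actual application code is not shown in the question):

#include <cuda_runtime.h>
#include <stdlib.h>

int main()
{
    // Placeholder dimensions; the real array sizes are not given in the question.
    const int Nrows = 1024, Ncols = 1024;

    float *h_data = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *d_data;
    size_t pitch;                                   // row stride in bytes, chosen by the runtime

    cudaMallocPitch((void **)&d_data, &pitch, Ncols * sizeof(float), Nrows);
    cudaMemcpy2D(d_data, pitch,                     // destination and its pitch
                 h_data, Ncols * sizeof(float),     // source and its (unpadded) pitch
                 Ncols * sizeof(float), Nrows,      // width in bytes, height in rows
                 cudaMemcpyHostToDevice);

    cudaFree(d_data); free(h_data);
    return 0;
}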

As a first optimization step, I am trying to coalesce the accesses to global memory by reading the data of interest collectively into shared memory. As a test on the non-optimized code (which, for example, still contains divergent branches and non-coalesced accesses to global memory), I instead allocated larger arrays with cudaMalloc() and observed a performance increase of up to 50%. How is this possible?
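A minimal sketch of the kind of shared-memory staging described above (tile size, array layout, and kernel name are illustrative assumptions, not code from the question):

#define TILE 16

// Assumed layout: row-major array of 'width' elements per row with a row stride of 'pitch' bytes.
__global__ void load_tile_kernel(const float * __restrict__ in, int width, int height, size_t pitch)
{
    __shared__ float tile[TILE][TILE];

    const int col = blockIdx.x * TILE + threadIdx.x;
    const int row = blockIdx.y * TILE + threadIdx.y;

    if (col < width && row < height) {
        // Threads with consecutive threadIdx.x read consecutive addresses, so the load is coalesced.
        const float *rowPtr = (const float *)((const char *)in + row * pitch);
        tile[threadIdx.y][threadIdx.x] = rowPtr[col];
    }
    __syncthreads();

    // ... the search for the smaller array would then work on the tile in shared memory ...
}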


2 Answers


cudaMallocPitch() ensures that the starting address of every row of the (row-major) 2D array is a multiple of 2^N, where N is 7 to 10 depending on the compute capability.
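As a quick way to see this rounding, one can print the pitch returned for a row width that is not already a multiple of the alignment (a small standalone check, not part of the original answer):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    float *d_ptr;
    size_t pitch;
    // Request rows of 1000 floats = 4000 bytes each.
    cudaMallocPitch((void **)&d_ptr, &pitch, 1000 * sizeof(float), 100);
    // Typically prints a multiple of the alignment (e.g. 4096), not 4000.
    printf("pitch = %zu bytes\n", pitch);
    cudaFree(d_ptr);
    return 0;
}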

Whether the accesses are more efficient depends not only on the data alignment but also on your compute capability, the global memory access pattern, and sometimes the cache configuration.
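For instance, on devices with a configurable L1/shared memory split (Fermi and later, so not the compute capability 1.3 device in the question), the cache configuration can be changed with a call such as:

// Ask the runtime to prefer a larger L1 cache for subsequent kernel launches
// (ignored on devices without a configurable L1 cache).
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);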

This blog post explains the large bandwidth reduction caused by misaligned data accesses on early compute capabilities, which may be the answer to your question.

https://developer.nvidia.com/content/how-access-global-memory-efficiently-cuda-cc-kernels

Since the performance depends on many factors, you may have to post your device model and kernel code for further investigation.
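The device model can be reported with a standard properties query, for example:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int dev;
    cudaDeviceProp prop;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&prop, dev);
    printf("Device: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}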

Answered 2013-02-06T02:26:07.997

As 康世印 has already pointed out, the improvement brought by using cudaMallocPitch depends on the compute capability and is expected to be more significant for older compute capabilities. However, for recent compute capabilities, pitched memory allocation does not seem to provide any relevant speedup.

The code below provides a testbed for comparing the performance of non-pitched and pitched memory. In particular, it performs the sum of three (non-pitched or pitched) matrices. The reason for working with three matrices is the need to emphasize memory transactions relative to computation, so as to highlight the difference between non-pitched and pitched allocations. Below are the timing results for a GTX 960 card and a GT 920M card.

GTX 960

Non-pitched - Time = 3.242208; Memory = 65320000 bytes
Pitched     - Time = 3.150944; Memory = 65433600 bytes

GT 920M

Non-pitched - Time = 20.496799; Memory = 65320000 bytes
Pitched     - Time = 20.418560; Memory = 65433600 bytes

As can be seen, on both cards the two implementations do not differ significantly. The results above also show the increase in memory footprint due to the pitched memory allocation.
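The reported footprints are consistent with the row padding introduced by cudaMallocPitch: with Ncols = 2300, each row occupies 2300 × 4 = 9200 bytes, which the pitched allocation here rounds up to 9216 bytes (a multiple of 512), so the 7100 rows take 7100 × 9216 = 65,433,600 bytes instead of 7100 × 9200 = 65,320,000 bytes.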

Here is the code:

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16

/******************/
/* TEST KERNEL 2D */
/******************/
__global__ void test_kernel_2D(float * __restrict__ devPtrA, float * __restrict__ devPtrB, float * __restrict__ devPtrC, const int Nrows, const int Ncols)
{
    int    tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int    tidy = blockIdx.y * blockDim.y + threadIdx.y;

    if ((tidx < Ncols) && (tidy < Nrows)) {
        devPtrA[tidy * Ncols + tidx] = devPtrA[tidy * Ncols + tidx] + devPtrB[tidy * Ncols + tidx] + devPtrC[tidy * Ncols + tidx];
    }
}

/**************************/
/* TEST KERNEL PITCHED 2D */
/**************************/
__global__ void test_kernel_Pitched_2D(float * __restrict__ devPtrA, float * __restrict__ devPtrB, float * __restrict__ devPtrC, const size_t pitchA, const size_t pitchB, const size_t pitchC, const int Nrows, const int Ncols)
{
    int    tidx = blockIdx.x * blockDim.x + threadIdx.x;
    int    tidy = blockIdx.y * blockDim.y + threadIdx.y;

    if ((tidx < Ncols) && (tidy < Nrows))
    {
        float *row_a = (float *)((char*)devPtrA + tidy * pitchA);
        float *row_b = (float *)((char*)devPtrB + tidy * pitchB);
        float *row_c = (float *)((char*)devPtrC + tidy * pitchC);
        row_a[tidx] = row_a[tidx] + row_b[tidx] + row_c[tidx];
    }
}

/********/
/* MAIN */
/********/
int main()
{
    const int Nrows = 7100;
    const int Ncols = 2300;

    TimingGPU timerGPU;

    float *hostPtrA = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *hostPtrB = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *hostPtrC = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *devPtrA, *devPtrPitchedA;
    float *devPtrB, *devPtrPitchedB;
    float *devPtrC, *devPtrPitchedC;
    size_t pitchA, pitchB, pitchC;

    for (int i = 0; i < Nrows; i++)
        for (int j = 0; j < Ncols; j++) {
            hostPtrA[i * Ncols + j] = 1.f;
            hostPtrB[i * Ncols + j] = 2.f;
            hostPtrC[i * Ncols + j] = 3.f;
            //printf("row %i column %i value %f \n", i, j, hostPtrA[i * Ncols + j]);
        }

    // --- 2D non-pitched allocation and host->device memcopy
    gpuErrchk(cudaMalloc(&devPtrA, Nrows * Ncols * sizeof(float)));
    gpuErrchk(cudaMalloc(&devPtrB, Nrows * Ncols * sizeof(float)));
    gpuErrchk(cudaMalloc(&devPtrC, Nrows * Ncols * sizeof(float)));
    gpuErrchk(cudaMemcpy(devPtrA, hostPtrA, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devPtrB, hostPtrB, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devPtrC, hostPtrC, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    // --- 2D pitched allocation and host->device memcopy
    gpuErrchk(cudaMallocPitch(&devPtrPitchedA, &pitchA, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMallocPitch(&devPtrPitchedB, &pitchB, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMallocPitch(&devPtrPitchedC, &pitchC, Ncols * sizeof(float), Nrows));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedA, pitchA, hostPtrA, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedB, pitchB, hostPtrB, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy2D(devPtrPitchedC, pitchC, hostPtrC, Ncols * sizeof(float), Ncols*sizeof(float), Nrows, cudaMemcpyHostToDevice));

    dim3 gridSize(iDivUp(Ncols, BLOCKSIZE_x), iDivUp(Nrows, BLOCKSIZE_y));
    dim3 blockSize(BLOCKSIZE_x, BLOCKSIZE_y);

    timerGPU.StartCounter();
    test_kernel_2D << <gridSize, blockSize >> >(devPtrA, devPtrB, devPtrC, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Non-pitched - Time = %f; Memory = %i bytes \n", timerGPU.GetCounter(), Nrows * Ncols * sizeof(float));

    timerGPU.StartCounter();
    test_kernel_Pitched_2D << <gridSize, blockSize >> >(devPtrPitchedA, devPtrPitchedB, devPtrPitchedC, pitchA, pitchB, pitchC, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Pitched - Time = %f; Memory = %i bytes \n", timerGPU.GetCounter(), Nrows * pitchA);

    //gpuErrchk(cudaMemcpy2D(hostPtrA, Ncols * sizeof(float), devPtrPitchedA, pitchA, Ncols * sizeof(float), Nrows, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hostPtrA, devPtrA, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hostPtrB, devPtrB, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(hostPtrC, devPtrC, Nrows * Ncols * sizeof(float), cudaMemcpyDeviceToHost));

    //for (int i = 0; i < Nrows; i++) 
    //  for (int j = 0; j < Ncols; j++) 
    //      printf("row %i column %i value %f \n", i, j, hostPtrA[i * Ncols + j]);

    return 0;

}
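Note that, for brevity, the code does not release its allocations before returning; a complete program would presumably free them, for example:

    gpuErrchk(cudaFree(devPtrA));        gpuErrchk(cudaFree(devPtrPitchedA));
    gpuErrchk(cudaFree(devPtrB));        gpuErrchk(cudaFree(devPtrPitchedB));
    gpuErrchk(cudaFree(devPtrC));        gpuErrchk(cudaFree(devPtrPitchedC));
    free(hostPtrA); free(hostPtrB); free(hostPtrC);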
Answered 2017-05-08T17:04:34.177