cuda - NVIDIA Kepler 架构的 ILP 效率和性能

Question

引用 NVIDIA 提供的“Kepler Tuning Guide”：

另请注意，Kepler GPU 可以比 Fermi GPU 更容易地利用 ILP 代替线程/warp 级并行 (TLP)。

在我看来，以下代码片段

a = .....;
a2 = f(a); 
a3 = g(a2);

可以改进如下

a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);

所以在我的项目中，我有一段代码如下（示例1）

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        src.ptr(y)[x] = make_short4(0,0,0,0);
    }
}

我将其重写如下（example2）

if(x < src.cols && y < src.rows)
{
    if(!mask(y,x))
    {
        short4 t;
        t.x = 0;
        t.y = 0;
        t.z = 0;
        t.w = 0;
        src.ptr(y)[x].x = t.x;
        src.ptr(y)[x].y = t.y;
        src.ptr(y)[x].z = t.z;
        src.ptr(y)[x].w = t.w;  
     }
}

在 Kepler 架构中，example2 会比 example1 更高效，表现出更好的性能，对吗？

score 3 · Accepted Answer

可以在CUDA Performance: Maximizing Instruction-Level Parallelism中找到有关指令级并行(ILP)的一个很好的解释。

Robert Crovella 和 talonmies 已经指出，并且您自己已经认识到，您上面的示例没有达到 ILP。

关于如何实现 ILP，我在下面展示了经典示例，该示例从numbapro-examples的 PyCUDA 代码翻译而来，我已经针对 Fermi 和 Kepler GPU 进行了测试。请注意，对于后一种情况，我没有观察到相关的加速。

编码

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE 64

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {

    int i = threadIdx.x + blockIdx.x * blockDim.x;

    d_c[i] = d_a[i] + d_b[i];

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;

}

/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {

    // --- Loading the data
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    float ai = d_a[i];
    float bi = d_b[i];

    int stride = gridDim.x * blockDim.x;

    int j = i + stride;
    float aj = d_a[j];
    float bj = d_b[j];

    int k = j + stride;
    float ak = d_a[k];
    float bk = d_b[k];

    int l = k + stride;
    float al = d_a[l];
    float bl = d_b[l];

    int m = l + stride;
    float am = d_a[m];
    float bm = d_b[m];

    int n = m + stride;
    float an = d_a[n];
    float bn = d_b[n];

    int p = n + stride;
    float ap = d_a[p];
    float bp = d_b[p];

    int q = p + stride;
    float aq = d_a[q];
    float bq = d_b[q];

    // --- Computing
    float ci = ai + bi;
    float cj = aj + bj;
    float ck = ak + bk;
    float cl = al + bl;
    float cm = am + bm;
    float cn = an + bn;
    float cp = ap + bp;
    float cq = aq + bq;

    // --- Writing the data
    d_c[i] = ci;
    d_c[j] = cj;
    d_c[k] = ck;
    d_c[l] = cl;
    d_c[m] = cm;
    d_c[n] = cn;
    d_c[p] = cp;
    d_c[q] = cq;

}

/********/
/* MAIN */
/********/
void main() {

    float timing;
    cudaEvent_t start, stop;

    const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE

    float* a = (float*)malloc(N*sizeof(float));
    float* b = (float*)malloc(N*sizeof(float));
    float* c = (float*)malloc(N*sizeof(float));
    float* c_ref = (float*)malloc(N*sizeof(float));

    srand(time(NULL));
    for (int i=0; i<N; i++) {

        a[i] = rand() / RAND_MAX;
        b[i] = rand() / RAND_MAX;
        c_ref[i] = a[i] + b[i];

    }

    float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
    float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
    float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
    float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
    float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
    float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));

    /******************/
    /* ILP0 TEST CASE */
    /******************/
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP0:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP2 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP2:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP4 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP4:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

    /******************/
    /* ILP8 TEST CASE */
    /******************/
    cudaEventRecord(start, 0);
    ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&timing, start, stop);
    printf("Elapsed time - ILP8:  %3.3f ms \n", timing);

    gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));

    // --- Checking the results
    for (int i=0; i<N; i++)
        if (c[i] != c_ref[i]) {

            printf("%f %f\n",c[i],c_ref[i]);
            printf("Error!\n");
            return;

        }

    printf("Test passed!\n");

}

表现

Card                    Kernel          Time [ms]            Speedup
GeForce GT540M          ILP0            4.609                1
      "                 ILP2            2.666                1.72
      "                 ILP4            1.675                2.76
      "                 ILP8            1.477                3.12

Kepler K20c             ILP0            0.045                
      "                 ILP2            0.043                
      "                 ILP4            0.043                
      "                 ILP8            0.042

cuda - NVIDIA Kepler 架构的 ILP 效率和性能

1 回答 1

Related

Reference