cuda - 相对较小的阵列大小的 GPU 精度问题？

Question

我有一些 CUDA 代码可以执行一些线性代数来反转一种特殊类型的结构化矩阵。我使用算法的序列化版本的结果来计算 RMS 误差。错误随着问题的大小而增长，其程度超出了我的预期。任何人都可以提供有关为什么会这样的见解吗？

GPU 代码非常幼稚。这是故意的，我很快就会优化它——我只想要一个简单的基线内核，它可以给出正确的结果。

 __global__ void levinson_durbin_gpu(TYPE *h0_d, TYPE *h_d, TYPE *v_d, TYPE *x_d, TYPE *y_d, int N) //Naive kernel
{
    int j = threadIdx.x;
    int i;

    __shared__ TYPE hn_1[512]; 
    hn_1[j] = h_d[j];

    for(i=1; i<N; i++)
    {
        if(j < i)
        {
            TYPE hn = h_d[i];
            TYPE yn = y_d[i];

            __syncthreads();

            //Set up temporary arrays, compute inner products
            __shared__ TYPE temp[512]; //Temp for hn_1_J_v
            __shared__ TYPE temp2[512]; //Temp for hn_1_J_x
            __shared__ TYPE temp3[512]; //Temp for hn_1_v

            temp[j] = hn_1[j]*v_d[i-j-1];
            temp2[j] = hn_1[j]*x_d[i-j-1];
            temp3[j] = hn_1[j]*v_d[j];
            __syncthreads();

            //Three reductions at once
            for(unsigned int s=1; s<i; s*=2)
            {
                int index = 2*s*j;
                if((index+s) < i)
                {
                    temp[index] += temp[index+s];
                    temp2[index] += temp2[index+s];
                    temp3[index] += temp3[index+s];
                }
                __syncthreads();
            }

            TYPE hn_1_J_v = temp[0];
            TYPE hn_1_J_x = temp2[0];
            TYPE hn_1_v = temp3[0];

            TYPE alpha_v = (hn - hn_1_J_v)/(h0_d[0] - hn_1_v);
            TYPE alpha_x = (yn - hn_1_J_x)/(h0_d[0] - hn_1_v);

            __shared__ TYPE w_v[512];
            w_v[j] = v_d[j] - alpha_v*v_d[i-j-1];

            __shared__ TYPE w_x[512];
            w_x[j] = x_d[j] - alpha_x*v_d[i-j-1];

            v_d[j] = w_v[j];
            x_d[j] = w_x[j];
            if(j == 0)
            {
                v_d[i] = alpha_v;
                x_d[i] = alpha_x;
            }


        }

        __syncthreads();
    }

}

标识符TYPE是浮点数或双精度数，具体取决于我编译代码的方式。我正在使用 1 个带有 N 个线程的块（再次，在这里保持天真和简单）。使用单精度，我看到以下结果：

N=4：RMS 误差 = 0.0000000027
N=8：RMS 误差 = 0.0000001127
N=16：RMS 误差 = 0.0000008832
N=32：RMS 误差 = 0.0000009233
N=64：RMS 误差 = 42.0136776452
N=80：RMS 误差 = 281438717。

我不知道这是我的算法错误还是某种精度问题。如果有帮助，我可以使用双精度、算法的 CPU 版本或计算 RMS 误差的代码来显示上述结果。我正在使用 GeForce GTX 660 Ti (cc 3.0) GPU。变量x_d包含最终结果。

score 1 · Accepted Answer

感谢评论部分的帮助，我自己解决了这个问题，所以我会在这里记录下来，以防其他人遇到类似的问题。

问题确实是同步问题——我__syncthreads()在不同的控制流块中使用。解决方案是将控制流块分成多个部分并__syncthreads()在每个部分之后调用：

__global__ void levinson_durbin_gpu(TYPE *h0_d, TYPE *h_d, TYPE *v_d, TYPE *x_d, TYPE *y_d, int N) //Naive kernel
{
    int j = threadIdx.x;
    int i;

    __shared__ TYPE hn_1[512];
    hn_1[j] = h_d[j];
    __syncthreads();

    //Set up temporary arrays
    __shared__ TYPE temp[512]; //Temp for hn_1_J_v
    __shared__ TYPE temp2[512]; //Temp for hn_1_J_x
    __shared__ TYPE temp3[512]; //Temp for hn_1_v

    TYPE hn;
    TYPE yn;

    for(i=1; i<N; i++)
    {
        if(j < i)
        {
            hn = h_d[i];
            yn = y_d[i];

            //Compute inner products
            temp[j] = hn_1[j]*v_d[i-j-1];
            temp2[j] = hn_1[j]*x_d[i-j-1];
            temp3[j] = hn_1[j]*v_d[j];
        }

        __syncthreads();

        //Have all threads complete this section to avoid synchronization issues

        //Three reductions at once
        for(unsigned int s=1; s<i; s*=2)
        {
            int index = 2*s*j;
            if((index+s) < i)
            {
                temp[index] += temp[index+s];
                temp2[index] += temp2[index+s];
                temp3[index] += temp3[index+s];
            }
            __syncthreads();
        }

        if(j < i)
        {
            TYPE hn_1_J_v = temp[0];
            TYPE hn_1_J_x = temp2[0];
            TYPE hn_1_v = temp3[0];

            TYPE alpha_v = (hn - hn_1_J_v)/(h0_d[0] - hn_1_v);
            TYPE alpha_x = (yn - hn_1_J_x)/(h0_d[0] - hn_1_v);

            __shared__ TYPE w_v[512];
            w_v[j] = v_d[j] - alpha_v*v_d[i-j-1];

            __shared__ TYPE w_x[512];
            w_x[j] = x_d[j] - alpha_x*v_d[i-j-1];

            v_d[j] = w_v[j];
            x_d[j] = w_x[j];
            if(j == 0)
            {
                v_d[i] = alpha_v;
                x_d[i] = alpha_x;
            }
        }

    __syncthreads();
}

}

N=32：RMS 误差 = 0.0000009233
N=64：RMS 误差 = 0.0000027644
N=128：RMS 误差 = 0.0000058276
N=256：RMS 误差 = 0.0000117755
N=512：RMS 误差 = 0.0000237040

我学到了什么：在 CUDA 中使用同步机制时，请确保所有线程都达到相同的障碍点！我觉得这种事情应该会产生编译器警告。

cuda - 相对较小的阵列大小的 GPU 精度问题？

1 回答 1

Related

Reference