math - 如何在支持 FMA 的 GPU 上改进浮点除法？

Question

当使用通过 SPIR-V（特别是 Vulkan）转换计算着色器的 API 为 GPU 编写计算代码时，我保证浮点除法的 ULP 误差最多为 3。其他基本算术（加法、乘法）是正确舍入。

在这些情况下如何有效地实现正确的舍入划分？让我们假设 FMA 可用且已正确舍入。

非规范化会发生什么将取决于底层硬件。Vulkan API 允许查询设备是否可以保留非正规以及是否可以将它们刷新为零（因此不完全支持非正规的 GPU 将具有“canPreserve: false, canFlush: true”）。因此，让我们另外假设 GPU 可以生成和处理非规范化，而不会将它们刷新为零（否则尝试生成不正常的正确舍入结果似乎是徒劳的）。

score 3 · Accepted Answer

现代基于 FMA 的除法算法通常严重依赖于广泛研究此问题的 Peter Markstein 的工作。规范参考是：

Peter Markstein，“IA-64 和基本函数：速度和精度”，Prentice-Hall 2000。

下面的 C 代码也是基于 Markstein 的工作。它基本上由一个快速处理常见情况的快速路径和一个处理所有讨厌的特殊情况的慢速路径组成。

所涉及的概念非常简单，但它们确实需要仔细的数学分析以确保正确舍入的结果。第一步是计算除数的准确倒数。这需要精确计算近似误差，而 FMA 是最好的工具。从（基于硬件的）近似中对倒数的细化通常采用具有二次收敛的单个 Newton-Raphson 迭代或具有三次收敛的单个 Halley 迭代的形式，两者都完美地映射到 FMA。

将除数的倒数乘以被除数就可以得到商的近似值。这同样基于基于 FMA 的残差计算进行了细化。在这个阶段，通常使用按比例缩放的被除数版本以防止中间计算中的上溢和下溢，并在除法算法的快速路径中允许尽可能宽的操作数范围。这也意味着最后需要一个最终的缩放步骤来调整商的大小。

通过代码的两个路径的有利安排是首先无条件地执行快速路径计算，然后在最后检查是否满足使用快速路径的条件，如果不满足，则调用慢速路径代码。这允许计算与快速路径计算并发的必要条件，并允许概述慢速路径计算，这有助于将该代码放置在收集各种慢速路径或异常处理代码的很少使用的内存页面中。

请将下面的代码视为算法草图。包含的基于随机测试向量的“烟雾”级测试远非严格的测试框架。调用慢速路径的条件既没有被证明是尽可能严格，也没有被证明是必要的严格，也不是最有效的安排。倒数的细化使用单个 Newton-Raphson 步骤，但这对于给定 GPU 的倒数近似可能不够，并且可能需要以额外的 FMA 为代价来替换 Halley 迭代。最后，我省略了整个慢速路径的构建，因为这将超出 Stackoverflow 答案的限制，无论是在设计工作量还是文本描述量方面。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

float rcp_approx (float a)
{
    return 1.0f / a; // fake it for now
}

float fp32_div_slowpath (float dividend, float divisor)
{
    return dividend / divisor; // fake it for now
}

float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_HI_LIMIT = 0x7e800000u; // 0x1.0p+126
    const uint32_t FP32_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx (divisor); 
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_recip = fmaf (recip_approx, recip_err, recip_approx);
    float quotient_mant = refined_recip * dividend_mant;
    float quotient_residual = fmaf (quotient_mant, -divisor, dividend_mant);
    float refined_quotient_mant = fmaf (refined_recip, quotient_residual, quotient_mant);
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t id = float_as_uint32 (divisor) & ~FP32_SIGN_MASK;
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((id > FP32_HI_LIMIT) || ((iq - FP32_LO_LIMIT) > (FP32_HI_LIMIT - FP32_LO_LIMIT))) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }

    return final_quotient;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    uint64_t count = 0;
    float divisor, dividend, res, ref;

    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu", count);

        dividend = uint32_as_float (KISS);
        divisor  = uint32_as_float (KISS);
        res = fp32_div (dividend, divisor);
        ref = dividend / divisor;
        if (float_as_uint32 (res) != float_as_uint32 (ref)) {
            printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                    dividend, divisor, dividend, divisor, res, ref, res, ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
    return EXIT_SUCCESS;
}

重新熟悉该问题后的一些补充说明。如果硬件的倒数逼近指令将次正规视为零，这是有利的。这允许在调用慢速路径代码时消除除数幅度检查，因为任何大于 2126 的除数^将导致倒数为零和商无穷大，从而触发商检查。

至于商检查，任何表示法线的快速路径结果都可以接受，除了最小的法线，它可能表示从次法线结果的不正确舍入。换句话说，绝对值在 [0x1.000002p-126, 0x1.fffffep+127] 中的商不应触发慢路径代码的重新计算。

只要使用足够准确的倒数，就不需要商细化步骤。涵盖 [1,2] 中的所有除数和 [1,2] 中的所有除数以及因此有效数字（尾数）模式的所有可能组合的详尽测试对于现代硬件是可行的，需要 2 ^{46 个}测试用例。即使标量代码在单个 CPU 内核上运行，它也可以在不到四天的时间内完成，并且没有报告错误。

在实际使用中，可能会希望强制编译器内联fp32_div()，同时强制fp_div_slowpath()进入被调用的子例程。

下面我使用上面讨论的简化对基于 AVX 的单精度除法版本进行了原型设计。它通过了我所有的测试。由于基于 AVX 的硬件倒数的精度较低，因此需要进行哈雷迭代以将倒数细化为完全单精度，从而提供最大误差接近 0.5 ulp 的结果。

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "immintrin.h"

#define MODE_RANDOM           (1)
#define MODE_MANT_EXHAUSTIVE  (2)
#define TEST_MODE             (MODE_RANDOM)

float uint32_as_float (uint32_t a) { float r; memcpy (&r, &a, sizeof r); return r; }
uint32_t float_as_uint32 (float a) { uint32_t r; memcpy (&r, &a, sizeof r); return r; }
float fp32_div_slowpath (float dividend, float divisor) { return dividend / divisor; }

/* 12-bit approximation. Subnormal arguments and results are treated as zero */
float rcp_approx_avx (float divisor)
{
    float r;
    __m256 t = _mm256_set1_ps (divisor);
    t = _mm256_rcp_ps (t);
    memcpy (&r, &t, sizeof r);
    return r;
}

float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_avx (divisor);
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float recip_err2 = fmaf (recip_err, recip_err, recip_err);
    float refined_recip = fmaf (recip_approx, recip_err2, recip_approx);
    float dividend_mant = uint32_as_float ((float_as_uint32 (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = uint32_as_float (float_as_uint32 (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = float_as_uint32 (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }
    return final_quotient;
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

int main (void)
{
    uint64_t count = 0;
    float divisor, dividend, res, ref;

#if TEST_MODE == MODE_RANDOM
    do {
        if (!(count & 0xffffffULL)) printf ("\rcount: %llu", count);

        dividend = uint32_as_float (KISS);
        divisor  = uint32_as_float (KISS);
        res = fp32_div (dividend, divisor);
        ref = dividend / divisor;
        if (float_as_uint32 (res) != float_as_uint32 (ref)) {
            printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                    dividend, divisor,  dividend, divisor, res, ref, res, ref);
            return EXIT_FAILURE;
        }

        count++;
    } while (count);
#else
    for (dividend = 1.0f; dividend <= 2.0f; dividend = uint32_as_float (float_as_uint32 (dividend) + 1)) {
        printf ("\rdividend=%08x", float_as_uint32 (dividend));
        for (divisor = 1.0f; divisor <= 2.0f; divisor = uint32_as_float (float_as_uint32 (divisor) + 1)) {
            res = fp32_div (dividend, divisor);
            ref = dividend / divisor;
            if (float_as_uint32 (res) != float_as_uint32 (ref)) {
                printf ("error @ dividend=% 16.6a divisor=% 16.6a (% 15.8e, % 15.8e): res=% 16.6a ref=% 16.6a (% 15.8e, % 15.8e)\n", 
                        dividend, divisor,  dividend, divisor, res, ref, res, ref);
                return EXIT_FAILURE;
            }
        }
    }
#endif

    return EXIT_SUCCESS;
}

NVIDIA GPU 的相应 CUDA 代码（已测试）如下所示：

__noinline__ __device__ float fp32_div_slowpath (float dividend, float divisor) 
{ 
    return dividend / divisor; 
}

/* Subnormal arguments and results are treated as zero */
__forceinline__ __device__ float rcp_approx_gpu (float divisor)
{
    float r;
    asm ("rcp.approx.ftz.f32 %0,%1;\n\t" : "=f"(r) : "f"(divisor));
    return r;
}

__forceinline__ __device__ float fp32_div (float dividend, float divisor)
{
    const uint32_t FP32_MANT_MASK = 0x007fffffu;
    const uint32_t FP32_ONE = 0x3f800000u;
    const uint32_t FP32_SIGN_MASK = 0x80000000u;
    const uint32_t FP32_SIGN_EXPO_MASK = 0xff800000u;
    const uint32_t FP32_QUOTIENT_HI_LIMIT = 0x7f7fffffu; // 0x1.0p+128 - ulp
    const uint32_t FP32_QUOTIENT_LO_LIMIT = 0x00800001u; // 0x1.0p-126 + ulp

    // fast path
    float recip_approx = rcp_approx_gpu (divisor);
    float recip_err = fmaf (recip_approx, -divisor, 1.0f);
    float refined_recip = fmaf (recip_approx, recip_err, recip_approx);
    float dividend_mant = __int_as_float ((__float_as_int (dividend) & FP32_MANT_MASK) | FP32_ONE);
    float dividend_scale = __int_as_float (__float_as_int (dividend) & FP32_SIGN_EXPO_MASK);
    float refined_quotient_mant = refined_recip * dividend_mant;
    float refined_quotient_residual = fmaf (refined_quotient_mant, -divisor, dividend_mant);
    float final_quotient_mant = fmaf (refined_recip, refined_quotient_residual, refined_quotient_mant);
    float final_quotient = final_quotient_mant * dividend_scale;

    // check if we need to apply the slow path and invoke it if that is the case
    uint32_t iq = __float_as_int (final_quotient) & ~FP32_SIGN_MASK;
    if ((iq - FP32_QUOTIENT_LO_LIMIT) > (FP32_QUOTIENT_HI_LIMIT - FP32_QUOTIENT_LO_LIMIT)) {
        final_quotient = fp32_div_slowpath (dividend, divisor);
    }
    return final_quotient;
}

math - 如何在支持 FMA 的 GPU 上改进浮点除法？

1 回答 1

Related

Reference