c++ - C++ 的侧通道抗性数学函数

Question

我正在处理一个处理秘密数据的 SGX 项目，在某些时候，我需要评估浮点数的自然对数。评估过程应该是抗侧通道的，这意味着它的运行时间和内存访问模式将独立于它的输入和输出。

野外有这样的实现吗？文献中是否解决了这个问题？

score 5 · Accepted Answer

标签sgx表明您的硬件平台是最新的 x86_64 Intel CPU，它还支持 AVX2 和 FMA 操作。在运行时和内存访问模式中保持不变的实现的关键是避免分支。如果编译器配合并将简单的条件赋值转换为适当的条件移动或混合指令，则logf()下面的实现应该可以正常工作。然而，依赖编译器代码生成是脆弱的，在 Compiler Explorer 提供的各种编译器中，我只能让 clang 提供接近预期结果的东西，所有分支都转换除了一个（a = t处理中的条件分配非正规输入）。

因此，您可能必须通过适当的指令而不是分支代码来强制执行结果选择，例如通过使用内在函数。

正如EOF在评论中指出的那样，消除分支是必要但不充分的条件，因为单个浮点运算也可以具有可变的运行时间，即使它们只是加法、乘法和 FMA。这在高速处理诸如次正规（通常称为非正规）之类的特殊操作数的架构（例如 GPU）上不是问题。但是，这是我使用并使用过的 x86 处理器上的一个问题。通常，最严重的可变性是由于非正规结果而发生的，非正规源操作数的影响要小得多。

下面显示的代码包含使用原始参数a作为源操作数的多个操作，这使其面临由于异常输入而导致运行时变化的风险。是否应针对打算部署代码的特定平台仔细测试运行时的潜在变化是否超过噪声水平（例如，由于调用函数时管道状态的可变性）。

#include <cstdint>
#include <cstring>
#include <cmath>

int __float_as_int (float a)
{
    int r;
    memcpy (&r, &a, sizeof(r));
    return r;
}

float __int_as_float (int a)
{
    float r;
    memcpy (&r, &a, sizeof(r));
    return r;
}

/* maximum error 0.85417 ulp */
float my_logf (float a)
{
    float m, r, s, t, i, f, u;
    int32_t e;

    /* result for exceptional cases */
    u = a + a;  // silence NaNs if necessary
    if (a  < 0.0f) u =  0.0f / 0.0f; //  NaN
    if (a == 0.0f) u = -1.0f / 0.0f; // -Inf
    
    /* result for non-exceptional cases */
    i = 0.0f;

    /* fix up denormal input if needed */
    t = a * 8388608.0f;
    if (a < 1.17549435e-38f) {
        a = t;
        i = -23.0f;
    }

    /* split argument into exponent and mantissa parts */
    e = (__float_as_int (a) - 0x3f2aaaab) & 0xff800000;
    m = __int_as_float (__float_as_int (a) - e);
    i = fmaf ((float)e, 1.19209290e-7f, i);

    /* m in [2/3, 4/3] */
    f = m - 1.0f;
    s = f * f;
    /* Compute log1p(f) for f in [-1/3, 1/3] */
    r =             -0.130310059f; 
    t =              0.140869141f; 
    r = fmaf (r, s, -0.121489234f);
    t = fmaf (t, s,  0.139809728f);
    r = fmaf (r, s, -0.166844666f);
    t = fmaf (t, s,  0.200121239f);
    r = fmaf (r, s, -0.249996305f);
    r = fmaf (t, f, r);
    r = fmaf (r, f,  0.333331943f);
    r = fmaf (r, f, -0.500000000f);
    r = fmaf (r, s, f);
    r = fmaf (i, 0.693147182f, r); // log(2) 

    /* late selection between exceptional and non-exceptional result */
    if (!((a > 0.0f) && (a <= 3.40282347e+38f))) r = u;

    return r;
}

上面确定的潜在问题可以通过在对数计算中执行特殊情况处理和通过可移植的基于整数的代码进行结果选择来解决。明显的权衡是性能损失。处理非规范参数需要基于前导零 (CLZ) 的计数进行规范化。虽然 x86 处理器对此有说明，但它们可能无法从 C++ 以可移植方式访问。但是可以以直接的方式构建具有不变运行时的可移植实现。这导致了一个无分支的实现，我希望它可以与大多数编译器一起工作，但是仔细检查生成的机器代码将是必不可少的。我使用 Compiler Explorer 来验证它是否可以使用gcc 11.1 和 clang 12.0.1进行编译

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

/* reinterpret bit pattern of IEEE-754 binary32 as a 32-bit unsigned integer */ 
int32_t float_as_int32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

/* reinterpret bit pattern of a 32-bit unsigned integer as IEEE-754 binary32 */
float int32_as_float (int32_t a)
{
    float r; 
    memcpy (&r, &a, sizeof r); 
    return r;
}

/* branch free implementation of ((cond) ? a : b). cond must be in {0,1} */
int32_t mux (int cond, int32_t a, int32_t b)
{
    return (1 - cond) * b + cond * a;
}

/* leading zero count with invariant runtime */
int clz (uint32_t a)
{
    // Algorithm by aqrit, https://stackoverflow.com/a/58827596/780717
    int n = 158 - (((uint32_t)float_as_int32 ((float)(int32_t)(a & ~(a >> 1)))) >> 23);
    n = mux (n < 0, 0, n);   // clamp below
    n = mux (n > 32, 32, n); // clamp above
    return n;
}

/* Compute natural logarithm with a maximum error of 0.85089 ulp */
float my_logf (float a)
{
    float m, r, s, t, i, f;
    int32_t e, ia, ii, it, iu, im, shift, excp;
    const int32_t abs_mask   = 0x7fffffffu;
    const int32_t qnan_bit   = 0x00400000u;
    const int32_t pos_infty  = 0x7f800000u;
    const int32_t neg_infty  = 0xff800000u;
    const int32_t indefinite = 0xffc00000u;
    const int32_t zero_float = 0x00000000u; // 0.0f
    const int32_t one_float  = 0x3f800000u; // 1.0f
    const int32_t tiny_float = 0x00800000u; // 1.17549435e-38f
    const int32_t huge_float = 0x7f7fffffu; // 3.40282347e+38f 

    ia = float_as_int32 (a);

    /* result for exceptional cases */
    iu = mux ((int32_t)ia < 0, indefinite, ia); // return QNaN INDEFINITE
    iu = mux ((ia & abs_mask) == 0, neg_infty, iu); // return -Inf
    iu = mux ((ia & abs_mask) > pos_infty, ia | qnan_bit, iu); // convert to QNaN

    /* result for non-exceptional cases */
    shift = clz (ia) - 8;
    it = (ia << shift) + ((23 - shift) << 23);
    ii = mux (ia < tiny_float, -23, 0);
    it = mux (ia < tiny_float, it, ia);

    /* split argument into exponent and mantissa parts */
    e = (it - 0x3f2aaaab) & 0xff800000;
    m = int32_as_float (it - e);
    i = fmaf ((float)e, 1.19209290e-7f, (float)ii);

    /* m in [2/3, 4/3] */
    f = m - 1.0f;
    s = f * f;

    /* Compute log1p(f) for f in [-1/3, 1/3] */
    r =             -0.130310059f; 
    t =              0.140869141f; 
    r = fmaf (r, s, -0.121483363f);
    t = fmaf (t, s,  0.139814854f);
    r = fmaf (r, s, -0.166846141f);
    t = fmaf (t, s,  0.200120345f);
    r = fmaf (r, s, -0.249996200f);
    r = fmaf (t, f, r);
    r = fmaf (r, f,  0.333331972f);
    r = fmaf (r, f, -0.500000000f);
    r = fmaf (r, s, f);
    r = fmaf (i, 0.693147182f, r); // log(2) 

    /* late selection between exceptional and non-exceptional result */
    excp = ((uint32_t)ia - 1) > ((uint32_t)huge_float - 1);
    iu = mux (excp, iu, zero_float);
    im = mux (excp, zero_float, one_float);
    r = fmaf (int32_as_float (im), r, int32_as_float (iu));

    return r;
}

/* reinterpret bit pattern of IEEE-754 binary64 as a 64-bit unsigned integer */ 
uint64_t double_as_uint64 (double a)
{
    uint64_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

double floatUlpErr (float res, double ref)
{
    uint64_t i, j, err;
    int expoRef;
    
    /* ulp error cannot be computed if either operand is NaN, infinity, zero */
    if (isnan (res) || isnan (ref) || isinf (res) || isinf (ref) ||
        (res == 0.0f) || (ref == 0.0f)) {
        return 0.0;
    }
    /* Convert the float result to an "extended float". This is like a float
       with 56 instead of 24 effective mantissa bits
    */
    i = ((uint64_t)(uint32_t)float_as_int32 (res)) << 32;
    /* Convert the double reference to an "extended float". If the reference is
       >= 2^129, we need to clamp yo the maximum "extended float". If reference
       is < 2^-126, we need to denormalize because of float's limited exponent
       range.
    */
    expoRef = (int)(((double_as_uint64(ref) >> 52) & 0x7ff) - 1023);
    if (expoRef >= 129) {
        j = (double_as_uint64(ref) & 0x8000000000000000ULL) |
            0x7fffffffffffffffULL;
    } else if (expoRef < -126) {
        j = ((double_as_uint64(ref) << 11) | 0x8000000000000000ULL) >> 8;
        j = j >> (-(expoRef + 126));
        j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
    } else {
        j = ((double_as_uint64(ref) << 11) & 0x7fffffffffffffffULL) >> 8;
        j = j | ((uint64_t)(expoRef + 127) << 55);
        j = j | (double_as_uint64(ref) & 0x8000000000000000ULL);
    }
    err = (i < j) ? (j - i) : (i - j);
    return err / 4294967296.0;
}

int main (void)
{
    uint32_t diff, refi, resi, argi = 0;
    float reff, res, arg;
    double ref, ulp, maxulp = 0;

    do {
        arg = int32_as_float (argi);
        ref = log ((double)arg);
        reff = (float)ref;
        res = my_logf (arg);
        ulp = floatUlpErr (res, ref);
        if (ulp > maxulp) maxulp = ulp;
        resi = float_as_int32 (res);
        refi = float_as_int32 (reff);
        diff = (resi > refi) ? (resi - refi) : (refi - resi);
        if (diff > 1) {
            printf ("error: arg=%15.6a res=%15.6a ref=%15.6a\n", arg, res, ref);
            return EXIT_FAILURE;
        }
        argi++;
    } while (argi);
    printf ("maximum ulp error: %.5f\n", maxulp);
    return EXIT_SUCCESS;
}

c++ - C++ 的侧通道抗性数学函数

1 回答 1

Related

Reference