我提出的解决方案在“快速路径”上执行 6 次或 8 次 FMA 操作,在“快速路径”上执行 6 次或 8 次 FMA 操作中专门针对此特定场景(股息是 2 的幂)的长除法,然后在“慢路”。
以下程序对建议的解决方案执行详尽的测试(在支持 FMA 的 CPU 上需要大约 1-2 分钟)。
#include <math.h>
#include <stdint.h>
#include <stdio.h>
struct quomod {
unsigned long quo;
unsigned long mod;
};
// Divide 1 << (32 + SHIFT) by DIV, return quotient and modulus
struct quomod
quomod_ref(unsigned div, unsigned shift)
{
uint64_t t = 1ull << (32 + shift);
return (struct quomod){t / div, t % div};
}
// Reinterpret given bits as float
static inline float int_as_float(uint32_t bits)
{
return (union{ unsigned b; float f; }){bits}.f;
}
// F contains integral value in range [-2**32 .. 2**32]. Convert it to integer,
// with wrap-around on overflow. If the GPU implements saturating conversion,
// it also may be used
static inline uint32_t cvt_f32_u32_wrap(float f)
{
return (uint32_t)(long long)f;
}
struct quomod
quomod_alt(unsigned div, unsigned shift)
{
// t = float(1ull << (32 + shift))
float t = int_as_float(0x4f800000 + (shift << 23));
// mask with max(0, shift - 23) low bits zero
uint32_t mask = (int)(~0u << shift) >> 23;
// No roundoff in conversion
float div_f = div & mask;
// Caution: on the CPU this is correctly rounded, but on the GPU
// native reciprocal may be off by a few ULP, in which case a
// refinement step may be necessary:
// recip = fmaf(fmaf(recip, -div_f, 1), recip, recip)
float recip = 1.f / div_f;
// Higher part of the quotient, integer in range 2^31 .. 2^32
float quo_hi = t * recip;
// No roundoff
float res = fmaf(quo_hi, -div_f, t);
float quo_lo_approx = res * recip;
float res2 = fmaf(quo_lo_approx, -div_f, res);
// Lower part of the quotient, may be negative
float quo_lo = floorf(fmaf(res2, recip, quo_lo_approx));
// Remaining part of the dividend
float mod_f = fmaf(quo_lo, -div_f, res);
// Quotient as sum of parts
unsigned quo = cvt_f32_u32_wrap(quo_hi) + (int)quo_lo;
// Adjust quotient down if remainder is negative
if (mod_f < 0) {
quo--;
}
if (div & ~mask) {
// The quotient was computed for a truncated divisor, so
// it matches or exceeds the true result
// High part of the dividend
uint32_t ref_hi = 1u << shift;
// Unless quotient is zero after wraparound, increment it so
// it's higher than true quotient (its high bit must be 1)
quo -= (int)quo >> 31;
// Binary search for the true quotient; search invariant:
// quo is higher than true quotient, quo-2*bit is lower
for (unsigned bit = 256; bit; bit >>= 1) {
unsigned try = quo - bit;
// One multiply-high instruction
uint32_t prod_hi = 1ull * try * div >> 32;
if (prod_hi >= ref_hi)
quo = try;
}
// quo is zero or exceeds the true quotient, so quo-1 must be it
quo--;
}
// Use the "left-pointing short magic wand" operator
// to recover the remainder
return (struct quomod){quo, quo *- div};
}
int main()
{
fprintf(stderr, "%66c\r[", ']');
unsigned step = 1;
for (unsigned div = 3; div; div += step) {
// Progress bar
if (!(div & 0x03ffffff)) fprintf(stderr, "=");
// Skip powers of two
if (!(div & (div-1))) continue;
unsigned shift = 31 - __builtin_clz(div);
struct quomod ref = quomod_ref(div, shift);
struct quomod alt = quomod_alt(div, shift);
if (ref.quo != alt.quo || ref.mod != alt.mod) {
printf("\nerror at %u\n", div);
return 1;
}
}
fprintf(stderr, "=\nAll ok\n");
return 0;
}