我不知道任何适用于具有恒定股息的整数除法的优化。为了仔细检查,我用Compiler Explorer尝试了一个全除数的测试用例。使用指定了最高优化级别的 gcc、icc 和 clang,生成的代码显示没有对除法应用任何优化。
当然可以构建高性能的 128 位除法例程,但根据个人经验,我知道这很容易出错,并且需要非常复杂的测试来实现良好的测试覆盖率,包括极端情况,因为详尽的测试在这个操作数大小。设计和测试的努力很容易超过 Stackoverflow 上的答案似乎合理的两个数量级。
执行整数除法的一种简单方法是使用我们在小学时都学过的算法,只是二进制。这使得关于下一个商位的决定特别容易:当当前部分余数大于或等于除数时为 1,否则为 0。使用普通的二进制除法,我们唯一需要的整数运算是加法和减法。
我们可以通过模仿处理器的机器指令对多字整数进行操作的方式来构建可移植原语,以便在任何位长的操作数上执行这些操作:带进位的加法、带进位的加法、带进位的加法和执行;类似于 SUB。在下面的代码中,我为此使用了简单的 C 宏;当然更复杂的方法是可能的。
由于我现在正在使用的系统不支持 128 位整数,因此我为 64 位整数制作了原型并测试了这种方法。当时的 128 位版本是简单机械重命名的练习。在现代 64 位处理器上,我希望这个 128 位除法函数在大约 3000 个周期内执行。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#define SUBCcc(a,b,cy,t0,t1,t2) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t1-t0)
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
typedef struct {
uint64_t l;
uint64_t h;
} my_uint128;
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
my_uint128 quot, rem, tmp;
uint64_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (my_uint128);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
typedef struct {
uint32_t l;
uint32_t h;
} my_uint64;
my_uint64 bitwise_division_64 (my_uint64 dvnd, my_uint64 dvsr)
{
my_uint64 quot, rem, tmp;
uint32_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (my_uint64);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
{
uint64_t a, b, res, ref;
my_uint64 aa, bb, rr;
do {
a = KISS64;
b = KISS64;
ref = a / b;
aa.l = (uint32_t)a;
aa.h = (uint32_t)(a >> 32);
bb.l = (uint32_t)b;
bb.h = (uint32_t)(b >> 32);
rr = bitwise_division_64 (aa, bb);
res = (((uint64_t)rr.h) << 32) + rr.l;
if (ref != res) {
printf ("a=%016llx b=%016llx res=%016llx ref=%016llx\n", a, b, res, ref);
return EXIT_FAILURE;
}
} while (a);
return EXIT_SUCCESS;
}
比逐位计算更快的方法是计算除数的倒数,乘以被除数得到初步商,然后计算余数以精确调整商。整个计算可以在定点算术中完成。但是,在具有快速浮点单元的现代处理器上,使用双精度除法生成倒数的起始近似值会更方便。具有三次收敛的单次哈雷迭代会产生全精度倒数。
倒数的哈雷迭代是非常密集的整数乘法,64x64 位乘法与 128 位结果(umul64wide()
在下面的代码中)是对性能至关重要的构建块。在现代 64 位架构上,这通常是在几个周期内执行的单个机器指令,但是可移植代码无法访问。模拟指令的可移植代码需要大约 15 到 20 条指令,具体取决于架构和编译器。
整个 128 位除法大约需要 300 个周期,或者是简单按位计算的十倍。由于代码相当复杂,因此需要进行大量测试以确保正确运行。在下面的框架中,我使用基于模式的随机测试来进行中等强度的测试,并使用直接的按位实现作为参考。
下面的实现udiv128()
假设编程环境使用符合 IEEE-754 的浮点运算,double
类型映射到 IEEE-754 的binary64
类型,并且double
操作数的除法正确舍入。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
typedef struct {
uint64_t l;
uint64_t h;
} my_uint128;
my_uint128 make_my_uint128 (uint64_t h, uint64_t l);
my_uint128 add128 (my_uint128 a, my_uint128 b);
my_uint128 sub128 (my_uint128 a, my_uint128 b);
my_uint128 lsl128 (my_uint128 a, int sh);
my_uint128 lsr128 (my_uint128 a, int sh);
my_uint128 not128 (my_uint128 a);
my_uint128 umul128lo (my_uint128 a, my_uint128 b);
my_uint128 umul128hi (my_uint128 a, my_uint128 b);
double my_uint128_to_double (my_uint128 a);
int lt128 (my_uint128 a, my_uint128 b);
int eq128 (my_uint128 a, my_uint128 b);
uint64_t double_as_uint64 (double a);
double uint64_as_double (uint64_t a);
#define FP64_EXPO_BIAS (1023)
#define FP64_MANT_BITS (53)
#define FP64_MANT_IBIT (0x0010000000000000ULL)
#define FP64_MANT_MASK (0x000fffffffffffffULL)
#define FP64_INC_EXP_128 (0x0800000000000000ULL)
#define FP64_MANT_ADJ (2) // adjustment to ensure underestimate
my_uint128 udiv128 (my_uint128 dividend, my_uint128 divisor)
{
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
const my_uint128 two = make_my_uint128 (0ULL, 2ULL);
my_uint128 recip, temp, quo, rem;
my_uint128 neg_divisor = sub128 (zero, divisor);
double r;
/* compute initial approximation for reciprocal; must be underestimate! */
r = 1.0 / my_uint128_to_double (divisor);
uint64_t i = double_as_uint64 (r) - FP64_MANT_ADJ + FP64_INC_EXP_128;
temp = make_my_uint128 (0ULL, (i & FP64_MANT_MASK) | FP64_MANT_IBIT);
int sh = (i >> (FP64_MANT_BITS-1)) - FP64_EXPO_BIAS - (FP64_MANT_BITS-1);
recip = (sh < 0) ? lsr128 (temp, -sh) : lsl128 (temp, sh);
/* perform Halley iteration with cubic convergence to refine reciprocal */
temp = umul128lo (neg_divisor, recip);
temp = add128 (umul128hi (temp, temp), temp);
recip = add128 (umul128hi (recip, temp), recip);
/* compute preliminary quotient and remainder */
quo = umul128hi (dividend, recip);
rem = sub128 (dividend, umul128lo (divisor, quo));
/* adjust quotient if too small; quotient off by 2 at most */
if (! lt128 (rem, divisor)) {
quo = add128 (quo, lt128 (sub128 (rem, divisor), divisor) ? one : two);
}
/* handle division by zero */
if (eq128 (divisor, zero)) quo = not128 (zero);
return quo;
}
#define SUBCcc(a,b,cy,t0,t1,t2) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t2=t1<t0, cy=cy+t2, t1-t0)
#define SUBcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), cy=t1<t0, t1-t0)
#define SUBC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t1-t0)
#define ADDCcc(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), cy=t0<cy, t0=t0+t1, t1=t0<t1, cy=cy+t1, t0=t0)
#define ADDcc(a,b,cy,t0,t1) \
(t0=(b), t1=(a), t0=t0+t1, cy=t0<t1, t0=t0)
#define ADDC(a,b,cy,t0,t1) \
(t0=(b)+cy, t1=(a), t0+t1)
uint64_t double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}
double uint64_as_double (uint64_t a)
{
double r;
memcpy (&r, &a, sizeof r);
return r;
}
my_uint128 add128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1;
a.l = ADDcc (a.l, b.l, cy, t0, t1);
a.h = ADDC (a.h, b.h, cy, t0, t1);
return a;
}
my_uint128 sub128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1;
a.l = SUBcc (a.l, b.l, cy, t0, t1);
a.h = SUBC (a.h, b.h, cy, t0, t1);
return a;
}
my_uint128 lsl128 (my_uint128 a, int sh)
{
if (sh >= 64) {
a.h = a.l << (sh - 64);
a.l = 0ULL;
} else if (sh) {
a.h = (a.h << sh) + (a.l >> (64 - sh));
a.l = a.l << sh;
}
return a;
}
my_uint128 lsr128 (my_uint128 a, int sh)
{
if (sh >= 64) {
a.l = a.h >> (sh - 64);
a.h = 0ULL;
} else if (sh) {
a.l = (a.l >> sh) + (a.h << (64 - sh));
a.h = a.h >> sh;
}
return a;
}
my_uint128 not128 (my_uint128 a)
{
a.l = ~a.l;
a.h = ~a.h;
return a;
}
int lt128 (my_uint128 a, my_uint128 b)
{
uint64_t cy, t0, t1, t2;
a.l = SUBcc (a.l, b.l, cy, t0, t1);
a.h = SUBCcc (a.h, b.h, cy, t0, t1, t2);
return cy;
}
int eq128 (my_uint128 a, my_uint128 b)
{
return (a.l == b.l) && (a.h == b.h);
}
// derived from Hacker's Delight 2nd ed. figure 8-2
my_uint128 umul64wide (uint64_t u, uint64_t v)
{
my_uint128 r;
uint64_t u0, v0, u1, v1, w0, w1, w2, t;
u0 = (uint32_t)u; u1 = u >> 32;
v0 = (uint32_t)v; v1 = v >> 32;
w0 = u0 * v0;
t = u1 * v0 + (w0 >> 32);
w1 = (uint32_t)t;
w2 = t >> 32;
w1 = u0 * v1 + w1;
r.h = u1 * v1 + w2 + (w1 >> 32);
r.l = (w1 << 32) + (uint32_t)w0;
return r;
}
my_uint128 make_my_uint128 (uint64_t h, uint64_t l)
{
my_uint128 r;
r.h = h;
r.l = l;
return r;
}
my_uint128 umul128lo (my_uint128 a, my_uint128 b)
{
my_uint128 r;
r = umul64wide (a.l, b.l);
r.h = r.h + a.l * b.h + a.h * b.l;
return r;
}
my_uint128 umul128hi (my_uint128 a, my_uint128 b)
{
my_uint128 t0, t1, t2, t3;
t0 = umul64wide (a.l, b.l);
t3 = add128 (umul64wide (a.h, b.l), make_my_uint128 (0ULL, t0.h));
t1 = make_my_uint128 (0ULL, t3.l);
t2 = make_my_uint128 (0ULL, t3.h);
t1 = add128 (umul64wide (a.l, b.h), t1);
return add128 (add128 (umul64wide (a.h, b.h), t2), make_my_uint128 (0ULL, t1.h));
}
double my_uint128_to_double (my_uint128 a)
{
const int intbits = sizeof (a) * CHAR_BIT;
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
my_uint128 rnd, i = a;
uint64_t j;
int sh = 0;
double r;
// normalize integer so MSB is set
if (lt128 (i, make_my_uint128(0x0000000000000001ULL, 0))) {i = lsl128 (i,64); sh += 64; }
if (lt128 (i, make_my_uint128(0x0000000100000000ULL, 0))) {i = lsl128 (i,32); sh += 32; }
if (lt128 (i, make_my_uint128(0x0001000000000000ULL, 0))) {i = lsl128 (i,16); sh += 16; }
if (lt128 (i, make_my_uint128(0x0100000000000000ULL, 0))) {i = lsl128 (i, 8); sh += 8; }
if (lt128 (i, make_my_uint128(0x1000000000000000ULL, 0))) {i = lsl128 (i, 4); sh += 4; }
if (lt128 (i, make_my_uint128(0x4000000000000000ULL, 0))) {i = lsl128 (i, 2); sh += 2; }
if (lt128 (i, make_my_uint128(0x8000000000000000ULL, 0))) {i = lsl128 (i, 1); sh += 1; }
// form mantissa with explicit integer bit
rnd = lsl128 (i, FP64_MANT_BITS);
i = lsr128 (i, intbits - FP64_MANT_BITS);
j = i.l;
// add in exponent, taking into account integer bit of mantissa
if (! eq128 (a, zero)) {
j += (uint64_t)(FP64_EXPO_BIAS + (intbits-1) - 1 - sh) << (FP64_MANT_BITS-1);
}
// round to nearest or even
rnd.h = rnd.h | (rnd.l != 0);
if ((rnd.h > 0x8000000000000000ULL) ||
((rnd.h == 0x8000000000000000ULL) && (j & 1))) j++;
// reinterpret bit pattern as IEEE-754 'binary64'
r = uint64_as_double (j);
return r;
}
my_uint128 bitwise_division_128 (my_uint128 dvnd, my_uint128 dvsr)
{
my_uint128 quot, rem, tmp;
uint64_t cy, t0, t1, t2;
int bits_left = CHAR_BIT * sizeof (dvsr);
quot.h = dvnd.h;
quot.l = dvnd.l;
rem.h = 0;
rem.l = 0;
do {
quot.l = ADDcc (quot.l, quot.l, cy, t0, t1);
quot.h = ADDCcc (quot.h, quot.h, cy, t0, t1);
rem.l = ADDCcc (rem.l, rem.l, cy, t0, t1);
rem.h = ADDC (rem.h, rem.h, cy, t0, t1);
tmp.l = SUBcc (rem.l, dvsr.l, cy, t0, t1);
tmp.h = SUBCcc (rem.h, dvsr.h, cy, t0, t1, t2);
if (!cy) { // remainder >= divisor
rem.l = tmp.l;
rem.h = tmp.h;
quot.l = quot.l | 1;
}
bits_left--;
} while (bits_left);
return quot;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
my_uint128 v[100000]; /* FIXME: size appropriately */
int main (void)
{
const my_uint128 zero = make_my_uint128 (0ULL, 0ULL);
const my_uint128 one = make_my_uint128 (0ULL, 1ULL);
my_uint128 dividend, divisor, quot, ref;
int i, j, patterns, idx = 0, nbrBits = sizeof (v[0]) * CHAR_BIT;
int patterns_done = 0;
/* pattern class 1: 2**i */
for (i = 0; i < nbrBits; i++) {
v [idx] = lsl128 (one, i);
idx++;
}
/* pattern class 2: 2**i-1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = sub128 (lsl128 (one, i), one);
idx++;
}
/* pattern class 3: 2**i+1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = add128 (lsl128 (one, i), one);
idx++;
}
/* pattern class 4: 2**i + 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = add128 (lsl128 (one, i), lsl128 (one, j));
idx++;
}
}
/* pattern class 5: 2**i - 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = sub128 (lsl128 (one, i), lsl128 (one, j));
idx++;
}
}
patterns = idx;
/* pattern class 6: one's complement of pattern classes 1 through 5 */
for (i = 0; i < patterns; i++) {
v [idx] = not128 (v [i]);
idx++;
}
/* pattern class 7: two's complement of pattern classes 1 through 5 */
for (i = 0; i < patterns; i++) {
v [idx] = sub128 (zero, v[i]);
idx++;
}
patterns = idx;
printf ("Starting pattern-based tests. Number of patterns: %d\n", patterns);
for (long long int k = 0; k < 100000000000LL; k++) {
if (k < patterns * patterns) {
dividend = v [k / patterns];
divisor = v [k % patterns];
} else {
if (!patterns_done) {
printf ("Starting random tests\n");
patterns_done = 1;
}
dividend.l = KISS64;
dividend.h = KISS64;
divisor.h = KISS64;
divisor.l = KISS64;
}
/* exclude cases with undefined results: division by zero */
if (! eq128 (divisor, zero)) {
quot = udiv128 (dividend, divisor);
ref = bitwise_division_128 (dividend, divisor);
if (! eq128 (quot, ref)) {
printf ("@ (%016llx_%016llx, %016llx_%016llx): quot = %016llx_%016llx ref=%016llx_%016llx\n",
dividend.h, dividend.l, divisor.h, divisor.l,
quot.h, quot.l, ref.h, ref.l);
return EXIT_FAILURE;
}
}
}
printf ("unsigned 128-bit division: tests passed\n");
return EXIT_SUCCESS;
}