c++ - 在 C++ 中进行基本 128 位整数计算的有效方法？

Question

几年前，我需要一种方法来使用 Cuda 进行一些基本的 128 位整数数学运算： cuda 上的 128 位整数？. 现在我遇到了同样的问题，但是这次我需要在不支持任何类型的 128 位的 32 位嵌入式系统（英特尔爱迪生）上运行一些基本的 128 位算术（求和、位移和乘法）。但是，直接支持 64 位整数（unsigned long long int）。

我天真地尝试使用上次在 CPU 上回答我的 asm 代码，但我得到了一堆错误。我真的没有使用 asm 的经验，所以：用 64 位整数实现 128 位的加法、乘法和位移的最有效方法是什么？

score 10 · Accepted Answer

更新：由于 OP 尚未接受答案 <hint><hint>，我附上了更多代码。

使用上面讨论的库可能是一个好主意。虽然您今天可能只需要几个功能，但最终您可能会发现您还需要一个。在那之后还有一个。直到最终你最终编写、调试和维护你自己的 128 位数学库。这是浪费你的时间和精力。

那就是说。如果你决定自己动手：

1）您之前提出的 cuda 问题已经有用于乘法的 c 代码。有什么问题吗？

2）这种转变可能不会从使用 asm 中受益，所以 ac 解决方案在这里对我来说也很有意义。 ~~虽然如果性能在这里真的是一个问题，我会看看爱迪生是否支持 SHLD/SHRD，这可能会让这更快一点。否则，m~~也许是这样的方法？

my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
   my_uint128_t res;
   if (b < 32) {    
      res.x = a.x << b;
      res.y = (a.y << b) | (a.x >> (32 - b));
      res.z = (a.z << b) | (a.y >> (32 - b));
      res.w = (a.w << b) | (a.z >> (32 - b));
   } elseif (b < 64) {
      ...
   }

   return res;
}

更新：由于 Edison 似乎支持 SHLD/SHRD，这里有一个替代方案，它可能比上面的“c”代码性能更高。与所有声称更快的代码一样，您应该对其进行测试。

inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) &&
       __builtin_constant_p(from) &&
       __builtin_constant_p(c))
   {
      res = (into << c) | (from >> (32 - c));
   }
   else
   {
      asm("shld %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) && 
       __builtin_constant_p(from) && 
       __builtin_constant_p(c))
   {
      res = (into >> c) | (from << (32 - c));
   }
   else
   {
      asm("shrd %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = a.x << b;
      res.y = __shld(a.y, a.x, b);
      res.z = __shld(a.z, a.y, b);
      res.w = __shld(a.w, a.z, b);
   } else if (b < 64) {
      res.x = 0;
      res.y = a.x << (b - 32);
      res.z = __shld(a.y, a.x, b - 32);
      res.w = __shld(a.z, a.y, b - 32);
   } else if (b < 96) {
      res.x = 0;
      res.y = 0;
      res.z = a.x << (b - 64);
      res.w = __shld(a.y, a.x, b - 64);
   } else if (b < 128) {
      res.x = 0;
      res.y = 0;
      res.z = 0;
      res.w = a.x << (b - 96);
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = __shrd(a.x, a.y, b);
      res.y = __shrd(a.y, a.z, b);
      res.z = __shrd(a.z, a.w, b);
      res.w = a.w >> b;
   } else if (b < 64) {
      res.x = __shrd(a.y, a.z, b - 32);
      res.y = __shrd(a.z, a.w, b - 32);
      res.z = a.w >> (b - 32);
      res.w = 0;
   } else if (b < 96) {
      res.x = __shrd(a.z, a.w, b - 64);
      res.y = a.w >> (b - 64);
      res.z = 0;
      res.w = 0;
   } else if (b < 128) {
      res.x = a.w >> (b - 96);
      res.y = 0;
      res.z = 0;
      res.w = 0;
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

3）添加可能受益于asm。你可以试试这个：

struct my_uint128_t
{
   unsigned int x;
   unsigned int y;
   unsigned int z;
   unsigned int w;
};

my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
   my_uint128_t res;

    asm ("addl %5, %[resx]\n\t"
         "adcl %7, %[resy]\n\t"
         "adcl %9, %[resz]\n\t"
         "adcl %11, %[resw]\n\t"
         : [resx] "=&r" (res.x), [resy] "=&r" (res.y), 
           [resz] "=&r" (res.z), [resw] "=&r" (res.w)
         : "%0"(a.x), "irm"(b.x), 
           "%1"(a.y), "irm"(b.y), 
           "%2"(a.z), "irm"(b.z), 
           "%3"(a.w), "irm"(b.w)
         : "cc");

   return res;
}

我只是冲了这一点，所以使用风险自负。我没有爱迪生，但这适用于 x86。

更新：如果你只是在做积累（to += from而不是上面的代码c = a + b），这段代码可能会更好地为你服务：

inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
   asm ("addl %[fromx], %[tox]\n\t"
        "adcl %[fromy], %[toy]\n\t"
        "adcl %[fromz], %[toz]\n\t"
        "adcl %[fromw], %[tow]\n\t"
        : [tox] "+&r"(to->x), [toy] "+&r"(to->y), 
          [toz] "+&r"(to->z), [tow] "+&r"(to->w)
        : [fromx] "irm"(from.x), [fromy] "irm"(from.y), 
          [fromz] "irm"(from.z), [fromw] "irm"(from.w)
        : "cc");
}

score 5 · Accepted Answer

如果使用外部库是一个选项，那么看看这个问题。您可以使用TTMath，这是一个非常简单的大精度数学标头。在 32 位架构ttmath:UInt<4>上，将创建一个具有四个 32 位分支的 128 位int类型。Boost.Multiprecision或 calccrypto/uint128_t(u)int128_t中还有其他一些替代方案

如果您必须自己编写，那么已经有很多关于 SO 的解决方案，我将在这里总结它们

对于加法和减法，它非常简单明了，只需将单词（大型 int 库通常称为肢体）从最低有效到更高有效添加/减去，当然还有进位。

typedef struct INT128 {
    uint64_t H, L;
} my_uint128_t;

inline my_uint128_t add(my_uint128_t a, my_uint128_t b)
{
    my_uint128_t c;
    c.L = a.L + b.L;
    c.H = a.H + b.H + (c.L < a.L);  // c = a + b
    return c;
}

可以使用Compiler Explorer检查汇编输出

编译器已经可以为双字操作生成高效的代码，但是在从高级语言编译多字操作时，许多编译器不够聪明，无法使用“带进位加法”，正如您在问题高效 128 位加法中看到的那样，使用携带旗帜。因此，long long像上面那样使用 2 不仅会使它更具可读性，而且编译器也更容易发出更高效的代码。

如果这仍然不符合您的性能要求，则必须使用内在函数或将其编写在汇编中。要将存储bignum的 128 位值添加到 {eax, ebx, ecx, edx} 中的 128 位值中，您可以使用以下代码

add edx, [bignum]
adc ecx, [bignum+4]
adc ebx, [bignum+8]
adc eax, [bignum+12]

对于 Clang ，等效的内在函数将是这样的

unsigned *x, *y, *z, carryin=0, carryout;
z[0] = __builtin_addc(x[0], y[0], carryin, &carryout);
carryin = carryout;
z[1] = __builtin_addc(x[1], y[1], carryin, &carryout);
carryin = carryout;
z[2] = __builtin_addc(x[2], y[2], carryin, &carryout);
carryin = carryout;
z[3] = __builtin_addc(x[3], y[3], carryin, &carryout);

您需要将内部函数更改为编译器支持的函数，例如__builtin_uadd_overflowgcc或MSVC和ICC_addcarry_u32

有关更多信息，请阅读这些

对于位移，您可以在问题Bitwise shift operation on a 128-bit number中找到 C 解决方案。这是一个简单的左移，但您可以展开递归调用以获得更高的性能

void shiftl128 (
    unsigned int& a,
    unsigned int& b,
    unsigned int& c,
    unsigned int& d,
    size_t k)
{
    assert (k <= 128);
    if (k >= 32) // shifting a 32-bit integer by more than 31 bits is "undefined"
    {
        a=b;
        b=c;
        c=d;
        d=0;
        shiftl128(a,b,c,d,k-32);
    }
    else
    {
        a = (a << k) | (b >> (32-k));
        b = (b << k) | (c >> (32-k));
        c = (c << k) | (d >> (32-k));
        d = (d << k);
    }
}

小于 32 位移位的汇编可以在问题128 位移位使用汇编语言？

shld    edx, ecx, cl
shld    ecx, ebx, cl
shld    ebx, eax, cl
shl     eax, cl

右移可以类似地实现，或者只是从上面链接的问题中复制

乘法和除法要复杂得多，您可以参考问题Efficient Multiply/Divide of two 128-bit Integers on x86 (no 64-bit)中的解决方案：

class int128_t
{
    uint32_t dw3, dw2, dw1, dw0;

    // Various constrctors, operators, etc...

    int128_t& operator*=(const int128_t&  rhs) __attribute__((always_inline))
    {
        int128_t Urhs(rhs);
        uint32_t lhs_xor_mask = (int32_t(dw3) >> 31);
        uint32_t rhs_xor_mask = (int32_t(Urhs.dw3) >> 31);
        uint32_t result_xor_mask = (lhs_xor_mask ^ rhs_xor_mask);
        dw0 ^= lhs_xor_mask;
        dw1 ^= lhs_xor_mask;
        dw2 ^= lhs_xor_mask;
        dw3 ^= lhs_xor_mask;
        Urhs.dw0 ^= rhs_xor_mask;
        Urhs.dw1 ^= rhs_xor_mask;
        Urhs.dw2 ^= rhs_xor_mask;
        Urhs.dw3 ^= rhs_xor_mask;
        *this += (lhs_xor_mask & 1);
        Urhs += (rhs_xor_mask & 1);

        struct mul128_t
        {
            int128_t dqw1, dqw0;
            mul128_t(const int128_t& dqw1, const int128_t& dqw0): dqw1(dqw1), dqw0(dqw0){}
        };

        mul128_t data(Urhs,*this);
        asm volatile(
        "push      %%ebp                            \n\
        movl       %%eax,   %%ebp                   \n\
        movl       $0x00,   %%ebx                   \n\
        movl       $0x00,   %%ecx                   \n\
        movl       $0x00,   %%esi                   \n\
        movl       $0x00,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%ebx                   \n\
        adcl       %%edx,   %%ecx                   \n\
        adcl       $0x00,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   24(%%ebp),   %%eax #Calc: (dw1*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%ecx                   \n\
        adcl       %%edx,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   20(%%ebp),   %%eax #Calc: (dw2*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   16(%%ebp),   %%eax #Calc: (dw3*dw0)  \n\
        mull             12(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%ecx                   \n\
        adcl       %%edx,   %%esi                   \n\
        adcl       $0x00,   %%edi                   \n\
        movl   24(%%ebp),   %%eax #Calc: (dw1*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   20(%%ebp),   %%eax #Calc: (dw2*dw1)  \n\
        mull              8(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw2)  \n\
        mull              4(%%ebp)                  \n\
        addl       %%eax,   %%esi                   \n\
        adcl       %%edx,   %%edi                   \n\
        movl   24(%%ebp),  %%eax #Calc: (dw1*dw2)   \n\
        mull              4(%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        movl   28(%%ebp),   %%eax #Calc: (dw0*dw3)  \n\
        mull               (%%ebp)                  \n\
        addl       %%eax,   %%edi                   \n\
        pop        %%ebp                            \n"
        :"=b"(this->dw0),"=c"(this->dw1),"=S"(this->dw2),"=D"(this->dw3)
        :"a"(&data):"%ebp");

        dw0 ^= result_xor_mask;
        dw1 ^= result_xor_mask;
        dw2 ^= result_xor_mask;
        dw3 ^= result_xor_mask;
        return (*this += (result_xor_mask & 1));
    }
};

也可以找到很多128bit标签的相关问题

c++ - 在 C++ 中进行基本 128 位整数计算的有效方法？

2 回答 2

Related

Reference