0

将无符号 256 位整数乘以常数 977 的函数的两个版本。(用于密码学应用)__int128_t第二个函数中使用注释类型。

#include <stdint.h>
#include <stdio.h>
// z,z_carry = uint256(y) * 977
static inline void multiply_977(uint32_t *y, uint32_t *z) {
  const uint32_t x = 977;
  uint32_t high=0;
  uint64_t prod = x*(uint64_t)(y[0]); z[0] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[1])); z[1] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[2])); z[2] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[3])); z[3] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[4])); z[4] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[5])); z[5] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[6])); z[6] = prod;
  high=prod>>32; prod = high + (x*(uint64_t)(y[7])); z[7] = prod;
  high=prod>>32; z[8] = (uint32_t)high; z[9] = 0;
}
static inline void multiply_977_2(uint32_t *y_, uint32_t *z_) {
  const uint32_t x = 977;
  uint64_t *y = (uint64_t*)y_;
  uint64_t *z = (uint64_t*)z_;
  uint64_t high=0;
  __uint128_t prod = x * (__uint128_t)(y[0]);
  z[0] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[1]));
  z[1] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[2]));
  z[2] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[3]));
  z[3] = prod; high=prod>>64;
  z[4] = high&0x00000000ffffffff;
  // z_[8] = (uint64_t)high; z_[9] = 0;
}

int main(int argc, char** argv) {
  uint32_t a[10] = {0};
  uint32_t b[10] = {0};
  a[0] = 1;
  b[0] = 1;
  for (int i=0; i<100; ++i) {
    multiply_977(a, a);
    multiply_977_2(b, b);
  }
  for (int i=0; i<8; ++i) { printf("%08x ", a[i]); }
  printf("\n");
  for (int i=0; i<8; ++i) { printf("%08x ", b[i]); }
  printf("\n");
  return 0;
}

输出有和没有-O3,结果应该是相同的:

khaotik@KKST2:~/tmp$ gcc a.c; ./a.out
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 
khaotik@KKST2:~/tmp$ gcc -O3 a.c; ./a.out
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd 
00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000 

使用-O3, gcc 生成 SSE 代码:

00000000000010a0 <main>:
    10a0:   f3 0f 1e fa             endbr64 
    10a4:   41 54                   push   %r12
    10a6:   66 0f ef c0             pxor   %xmm0,%xmm0
    10aa:   b9 64 00 00 00          mov    $0x64,%ecx
    10af:   45 31 c9                xor    %r9d,%r9d
    10b2:   55                      push   %rbp
    10b3:   31 f6                   xor    %esi,%esi
    10b5:   31 ed                   xor    %ebp,%ebp
    10b7:   45 31 db                xor    %r11d,%r11d
    10ba:   53                      push   %rbx
    10bb:   45 31 c0                xor    %r8d,%r8d
    10be:   31 db                   xor    %ebx,%ebx
    10c0:   45 31 d2                xor    %r10d,%r10d
    10c3:   bf 01 00 00 00          mov    $0x1,%edi
    10c8:   48 83 ec 60             sub    $0x60,%rsp
    10cc:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
    10d3:   00 00 
    10d5:   48 89 44 24 58          mov    %rax,0x58(%rsp)
    10da:   31 c0                   xor    %eax,%eax
    10dc:   0f 29 44 24 30          movaps %xmm0,0x30(%rsp)
    10e1:   c7 44 24 24 00 00 00    movl   $0x0,0x24(%rsp)
    10e8:   00 
    10e9:   48 c7 44 24 50 00 00    movq   $0x0,0x50(%rsp)
    10f0:   00 00 
    10f2:   c7 44 24 30 01 00 00    movl   $0x1,0x30(%rsp)
    10f9:   00 
    10fa:   0f 29 44 24 40          movaps %xmm0,0x40(%rsp)
    10ff:   90                      nop
    1100:   89 f8                   mov    %edi,%eax
    1102:   4d 69 c9 d1 03 00 00    imul   $0x3d1,%r9,%r9
    1109:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    1110:   89 c7                   mov    %eax,%edi
    1112:   48 c1 e8 20             shr    $0x20,%rax
    1116:   48 89 c2                mov    %rax,%rdx
    1119:   44 89 d0                mov    %r10d,%eax
    111c:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    1123:   48 01 d0                add    %rdx,%rax
    1126:   41 89 c2                mov    %eax,%r10d
    1129:   48 c1 e8 20             shr    $0x20,%rax
    112d:   48 89 c2                mov    %rax,%rdx
    1130:   44 89 c0                mov    %r8d,%eax
    1133:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    113a:   48 01 d0                add    %rdx,%rax
    113d:   41 89 c0                mov    %eax,%r8d
    1140:   48 c1 e8 20             shr    $0x20,%rax
    1144:   48 89 c2                mov    %rax,%rdx
    1147:   44 89 d8                mov    %r11d,%eax
    114a:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    1151:   48 01 d0                add    %rdx,%rax
    1154:   41 89 c3                mov    %eax,%r11d
    1157:   48 c1 e8 20             shr    $0x20,%rax
    115b:   48 89 c2                mov    %rax,%rdx
    115e:   89 f0                   mov    %esi,%eax
    1160:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    1167:   48 01 d0                add    %rdx,%rax
    116a:   89 c6                   mov    %eax,%esi
    116c:   48 c1 e8 20             shr    $0x20,%rax
    1170:   48 89 c2                mov    %rax,%rdx
    1173:   89 e8                   mov    %ebp,%eax
    1175:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    117c:   48 01 d0                add    %rdx,%rax
    117f:   89 c5                   mov    %eax,%ebp
    1181:   48 c1 e8 20             shr    $0x20,%rax
    1185:   48 89 c2                mov    %rax,%rdx
    1188:   89 d8                   mov    %ebx,%eax
    118a:   48 69 c0 d1 03 00 00    imul   $0x3d1,%rax,%rax
    1191:   48 01 d0                add    %rdx,%rax
    1194:   89 c3                   mov    %eax,%ebx
    1196:   48 c1 e8 20             shr    $0x20,%rax
    119a:   4c 01 c8                add    %r9,%rax
    119d:   41 89 c1                mov    %eax,%r9d
    11a0:   83 e9 01                sub    $0x1,%ecx
    11a3:   0f 85 57 ff ff ff       jne    1100 <main+0x60>
    11a9:   66 41 0f 6e c8          movd   %r8d,%xmm1
    11ae:   66 41 0f 6e d3          movd   %r11d,%xmm2
    11b3:   66 0f 6e c7             movd   %edi,%xmm0
    11b7:   48 c1 e8 20             shr    $0x20,%rax
    11bb:   66 41 0f 6e da          movd   %r10d,%xmm3
    11c0:   66 0f 62 ca             punpckldq %xmm2,%xmm1
    11c4:   66 0f 6e ed             movd   %ebp,%xmm5
    11c8:   89 44 24 20             mov    %eax,0x20(%rsp)
    11cc:   66 0f 62 c3             punpckldq %xmm3,%xmm0
    11d0:   66 41 0f 6e e1          movd   %r9d,%xmm4
    11d5:   4c 8d 64 24 20          lea    0x20(%rsp),%r12
    11da:   66 0f 6c c1             punpcklqdq %xmm1,%xmm0
    11de:   66 0f 6e cb             movd   %ebx,%xmm1
    11e2:   48 8d 2d 1b 0e 00 00    lea    0xe1b(%rip),%rbp        # 2004 <_IO_stdin_used+0x4>
    11e9:   48 89 e3                mov    %rsp,%rbx
    11ec:   0f 29 04 24             movaps %xmm0,(%rsp)
    11f0:   66 0f 6e c6             movd   %esi,%xmm0
    11f4:   66 0f 62 cc             punpckldq %xmm4,%xmm1
    11f8:   66 0f 62 c5             punpckldq %xmm5,%xmm0
    11fc:   66 0f 6c c1             punpcklqdq %xmm1,%xmm0
    1200:   0f 29 44 24 10          movaps %xmm0,0x10(%rsp)
    1205:   0f 1f 00                nopl   (%rax)
    1208:   8b 13                   mov    (%rbx),%edx
    120a:   48 89 ee                mov    %rbp,%rsi
    120d:   bf 01 00 00 00          mov    $0x1,%edi
    1212:   31 c0                   xor    %eax,%eax
    1214:   48 83 c3 04             add    $0x4,%rbx
    1218:   e8 73 fe ff ff          callq  1090 <__printf_chk@plt>
    121d:   49 39 dc                cmp    %rbx,%r12
    1220:   75 e6                   jne    1208 <main+0x168>
    1222:   bf 0a 00 00 00          mov    $0xa,%edi
    1227:   48 8d 5c 24 30          lea    0x30(%rsp),%rbx
    122c:   4c 8d 64 24 50          lea    0x50(%rsp),%r12
    1231:   e8 3a fe ff ff          callq  1070 <putchar@plt>
    1236:   48 8d 2d c7 0d 00 00    lea    0xdc7(%rip),%rbp        # 2004 <_IO_stdin_used+0x4>
    123d:   0f 1f 00                nopl   (%rax)
    1240:   8b 13                   mov    (%rbx),%edx
    1242:   48 89 ee                mov    %rbp,%rsi
    1245:   bf 01 00 00 00          mov    $0x1,%edi
    124a:   31 c0                   xor    %eax,%eax
    124c:   48 83 c3 04             add    $0x4,%rbx
    1250:   e8 3b fe ff ff          callq  1090 <__printf_chk@plt>
    1255:   49 39 dc                cmp    %rbx,%r12
    1258:   75 e6                   jne    1240 <main+0x1a0>
    125a:   bf 0a 00 00 00          mov    $0xa,%edi
    125f:   e8 0c fe ff ff          callq  1070 <putchar@plt>
    1264:   48 8b 44 24 58          mov    0x58(%rsp),%rax
    1269:   64 48 33 04 25 28 00    xor    %fs:0x28,%rax
    1270:   00 00 
    1272:   75 0b                   jne    127f <main+0x1df>
    1274:   48 83 c4 60             add    $0x60,%rsp
    1278:   31 c0                   xor    %eax,%eax
    127a:   5b                      pop    %rbx
    127b:   5d                      pop    %rbp
    127c:   41 5c                   pop    %r12
    127e:   c3                      retq   
    127f:   e8 fc fd ff ff          callq  1080 <__stack_chk_fail@plt>
    1284:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
    128b:   00 00 00 
    128e:   66 90                   xchg   %ax,%ax

但是,我对 asm 的了解还不够,无法找出问题所在。

编译器版本和 cpu 标志:

khaotik@KKST2:~/tmp$ gcc --version
gcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

khaotik@KKST2:~/tmp$ lscpu | grep Flags
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d
4

0 回答 0