将无符号 256 位整数乘以常数 977 的函数的两个版本。(用于密码学应用)__int128_t
第二个函数中使用注释类型。
#include <stdint.h>
#include <stdio.h>
// z,z_carry = uint256(y) * 977
static inline void multiply_977(uint32_t *y, uint32_t *z) {
const uint32_t x = 977;
uint32_t high=0;
uint64_t prod = x*(uint64_t)(y[0]); z[0] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[1])); z[1] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[2])); z[2] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[3])); z[3] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[4])); z[4] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[5])); z[5] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[6])); z[6] = prod;
high=prod>>32; prod = high + (x*(uint64_t)(y[7])); z[7] = prod;
high=prod>>32; z[8] = (uint32_t)high; z[9] = 0;
}
static inline void multiply_977_2(uint32_t *y_, uint32_t *z_) {
const uint32_t x = 977;
uint64_t *y = (uint64_t*)y_;
uint64_t *z = (uint64_t*)z_;
uint64_t high=0;
__uint128_t prod = x * (__uint128_t)(y[0]);
z[0] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[1]));
z[1] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[2]));
z[2] = prod; high=prod>>64; prod = high + (x * (__uint128_t)(y[3]));
z[3] = prod; high=prod>>64;
z[4] = high&0x00000000ffffffff;
// z_[8] = (uint64_t)high; z_[9] = 0;
}
int main(int argc, char** argv) {
uint32_t a[10] = {0};
uint32_t b[10] = {0};
a[0] = 1;
b[0] = 1;
for (int i=0; i<100; ++i) {
multiply_977(a, a);
multiply_977_2(b, b);
}
for (int i=0; i<8; ++i) { printf("%08x ", a[i]); }
printf("\n");
for (int i=0; i<8; ++i) { printf("%08x ", b[i]); }
printf("\n");
return 0;
}
输出有和没有-O3
,结果应该是相同的:
khaotik@KKST2:~/tmp$ gcc a.c; ./a.out
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd
khaotik@KKST2:~/tmp$ gcc -O3 a.c; ./a.out
2372c341 af466dcc 57f3a318 7ce73fd9 cd8f973d 81dc6c7f 84637b1d f0de09cd
00000001 00000000 00000000 00000000 00000000 00000000 00000000 00000000
使用-O3
, gcc 生成 SSE 代码:
00000000000010a0 <main>:
10a0: f3 0f 1e fa endbr64
10a4: 41 54 push %r12
10a6: 66 0f ef c0 pxor %xmm0,%xmm0
10aa: b9 64 00 00 00 mov $0x64,%ecx
10af: 45 31 c9 xor %r9d,%r9d
10b2: 55 push %rbp
10b3: 31 f6 xor %esi,%esi
10b5: 31 ed xor %ebp,%ebp
10b7: 45 31 db xor %r11d,%r11d
10ba: 53 push %rbx
10bb: 45 31 c0 xor %r8d,%r8d
10be: 31 db xor %ebx,%ebx
10c0: 45 31 d2 xor %r10d,%r10d
10c3: bf 01 00 00 00 mov $0x1,%edi
10c8: 48 83 ec 60 sub $0x60,%rsp
10cc: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
10d3: 00 00
10d5: 48 89 44 24 58 mov %rax,0x58(%rsp)
10da: 31 c0 xor %eax,%eax
10dc: 0f 29 44 24 30 movaps %xmm0,0x30(%rsp)
10e1: c7 44 24 24 00 00 00 movl $0x0,0x24(%rsp)
10e8: 00
10e9: 48 c7 44 24 50 00 00 movq $0x0,0x50(%rsp)
10f0: 00 00
10f2: c7 44 24 30 01 00 00 movl $0x1,0x30(%rsp)
10f9: 00
10fa: 0f 29 44 24 40 movaps %xmm0,0x40(%rsp)
10ff: 90 nop
1100: 89 f8 mov %edi,%eax
1102: 4d 69 c9 d1 03 00 00 imul $0x3d1,%r9,%r9
1109: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
1110: 89 c7 mov %eax,%edi
1112: 48 c1 e8 20 shr $0x20,%rax
1116: 48 89 c2 mov %rax,%rdx
1119: 44 89 d0 mov %r10d,%eax
111c: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
1123: 48 01 d0 add %rdx,%rax
1126: 41 89 c2 mov %eax,%r10d
1129: 48 c1 e8 20 shr $0x20,%rax
112d: 48 89 c2 mov %rax,%rdx
1130: 44 89 c0 mov %r8d,%eax
1133: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
113a: 48 01 d0 add %rdx,%rax
113d: 41 89 c0 mov %eax,%r8d
1140: 48 c1 e8 20 shr $0x20,%rax
1144: 48 89 c2 mov %rax,%rdx
1147: 44 89 d8 mov %r11d,%eax
114a: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
1151: 48 01 d0 add %rdx,%rax
1154: 41 89 c3 mov %eax,%r11d
1157: 48 c1 e8 20 shr $0x20,%rax
115b: 48 89 c2 mov %rax,%rdx
115e: 89 f0 mov %esi,%eax
1160: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
1167: 48 01 d0 add %rdx,%rax
116a: 89 c6 mov %eax,%esi
116c: 48 c1 e8 20 shr $0x20,%rax
1170: 48 89 c2 mov %rax,%rdx
1173: 89 e8 mov %ebp,%eax
1175: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
117c: 48 01 d0 add %rdx,%rax
117f: 89 c5 mov %eax,%ebp
1181: 48 c1 e8 20 shr $0x20,%rax
1185: 48 89 c2 mov %rax,%rdx
1188: 89 d8 mov %ebx,%eax
118a: 48 69 c0 d1 03 00 00 imul $0x3d1,%rax,%rax
1191: 48 01 d0 add %rdx,%rax
1194: 89 c3 mov %eax,%ebx
1196: 48 c1 e8 20 shr $0x20,%rax
119a: 4c 01 c8 add %r9,%rax
119d: 41 89 c1 mov %eax,%r9d
11a0: 83 e9 01 sub $0x1,%ecx
11a3: 0f 85 57 ff ff ff jne 1100 <main+0x60>
11a9: 66 41 0f 6e c8 movd %r8d,%xmm1
11ae: 66 41 0f 6e d3 movd %r11d,%xmm2
11b3: 66 0f 6e c7 movd %edi,%xmm0
11b7: 48 c1 e8 20 shr $0x20,%rax
11bb: 66 41 0f 6e da movd %r10d,%xmm3
11c0: 66 0f 62 ca punpckldq %xmm2,%xmm1
11c4: 66 0f 6e ed movd %ebp,%xmm5
11c8: 89 44 24 20 mov %eax,0x20(%rsp)
11cc: 66 0f 62 c3 punpckldq %xmm3,%xmm0
11d0: 66 41 0f 6e e1 movd %r9d,%xmm4
11d5: 4c 8d 64 24 20 lea 0x20(%rsp),%r12
11da: 66 0f 6c c1 punpcklqdq %xmm1,%xmm0
11de: 66 0f 6e cb movd %ebx,%xmm1
11e2: 48 8d 2d 1b 0e 00 00 lea 0xe1b(%rip),%rbp # 2004 <_IO_stdin_used+0x4>
11e9: 48 89 e3 mov %rsp,%rbx
11ec: 0f 29 04 24 movaps %xmm0,(%rsp)
11f0: 66 0f 6e c6 movd %esi,%xmm0
11f4: 66 0f 62 cc punpckldq %xmm4,%xmm1
11f8: 66 0f 62 c5 punpckldq %xmm5,%xmm0
11fc: 66 0f 6c c1 punpcklqdq %xmm1,%xmm0
1200: 0f 29 44 24 10 movaps %xmm0,0x10(%rsp)
1205: 0f 1f 00 nopl (%rax)
1208: 8b 13 mov (%rbx),%edx
120a: 48 89 ee mov %rbp,%rsi
120d: bf 01 00 00 00 mov $0x1,%edi
1212: 31 c0 xor %eax,%eax
1214: 48 83 c3 04 add $0x4,%rbx
1218: e8 73 fe ff ff callq 1090 <__printf_chk@plt>
121d: 49 39 dc cmp %rbx,%r12
1220: 75 e6 jne 1208 <main+0x168>
1222: bf 0a 00 00 00 mov $0xa,%edi
1227: 48 8d 5c 24 30 lea 0x30(%rsp),%rbx
122c: 4c 8d 64 24 50 lea 0x50(%rsp),%r12
1231: e8 3a fe ff ff callq 1070 <putchar@plt>
1236: 48 8d 2d c7 0d 00 00 lea 0xdc7(%rip),%rbp # 2004 <_IO_stdin_used+0x4>
123d: 0f 1f 00 nopl (%rax)
1240: 8b 13 mov (%rbx),%edx
1242: 48 89 ee mov %rbp,%rsi
1245: bf 01 00 00 00 mov $0x1,%edi
124a: 31 c0 xor %eax,%eax
124c: 48 83 c3 04 add $0x4,%rbx
1250: e8 3b fe ff ff callq 1090 <__printf_chk@plt>
1255: 49 39 dc cmp %rbx,%r12
1258: 75 e6 jne 1240 <main+0x1a0>
125a: bf 0a 00 00 00 mov $0xa,%edi
125f: e8 0c fe ff ff callq 1070 <putchar@plt>
1264: 48 8b 44 24 58 mov 0x58(%rsp),%rax
1269: 64 48 33 04 25 28 00 xor %fs:0x28,%rax
1270: 00 00
1272: 75 0b jne 127f <main+0x1df>
1274: 48 83 c4 60 add $0x60,%rsp
1278: 31 c0 xor %eax,%eax
127a: 5b pop %rbx
127b: 5d pop %rbp
127c: 41 5c pop %r12
127e: c3 retq
127f: e8 fc fd ff ff callq 1080 <__stack_chk_fail@plt>
1284: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
128b: 00 00 00
128e: 66 90 xchg %ax,%ax
但是,我对 asm 的了解还不够,无法找出问题所在。
编译器版本和 cpu 标志:
khaotik@KKST2:~/tmp$ gcc --version
gcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
khaotik@KKST2:~/tmp$ lscpu | grep Flags
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d