gcc 中的 -ffast-math 选项允许编译器重新排序浮点运算,以便更快地执行。
这可能会导致这些操作的结果之间存在细微差异,具体取决于指针的对齐方式。例如,在 x64 上,一些优化指令 (AVX) 在 128 位对齐的指针上更快,因此允许编译器这样做是有道理的。
这是一个简单程序的示例,它显示了现代 CPU 上的这种行为:
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Adds the first `size` elements of `a`, front to back, starting from 0.0.
 * Returns 0.0 when `size` is zero or negative (the loop body never runs).
 */
double sum(int size, double *a) {
  double total = 0.0;
  int idx = 0;
  while (idx < size) {
    total = total + a[idx];
    ++idx;
  }
  return total;
}
/*
 * Fills a[0..size-1] with pseudo-random values rand() / RAND_MAX.
 * The generator is reseeded with 0 on every call, so two calls with the
 * same size write the same sequence.
 */
void init(int size, double *a) {
  srand(0);
  int idx = 0;
  while (idx < size) {
    int draw = rand();
    a[idx] = (double)draw / RAND_MAX;
    ++idx;
  }
}
/*
 * Fills `ref` with the init() sequence, copies it into `arr`, then prints
 * each pointer's address modulo 32 and the absolute difference between
 * the two sums of the (identical) data.
 */
void test(int size, double *ref, double *arr) {
  init(size, ref);
  memcpy(arr, ref, size * sizeof *arr);

  int ref_offset = (int)((uintptr_t)ref % 32);
  int arr_offset = (int)((uintptr_t)arr % 32);
  double gap = fabs(sum(size, arr) - sum(size, ref));

  printf("Alignment: ref:%d arr:%d diff:%g\n", ref_offset, arr_offset, gap);
}
/*
 * Entry point: sums the same data from a reference buffer and from a second
 * buffer whose start is shifted by 0..3 doubles, printing one line per shift.
 *
 * argv[1], when present, is the number of doubles to sum (default 100).
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the size argument is not a
 * non-negative int or an allocation fails.
 */
int main(int argc, char **argv) {
  int size = 100;
  if (argc > 1) {
    char *end = NULL;
    errno = 0;
    long parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        parsed < 0 || parsed > INT_MAX) {
      fprintf(stderr, "invalid size: %s\n", argv[1]);
      return EXIT_FAILURE;
    }
    size = (int)parsed;
  }

  /* Sizes are computed in size_t so that `size + 4` cannot overflow int. */
  double *array1 = malloc((size_t)size * sizeof *array1);
  /* Four spare slots let the loop below shift the start of array2 by 0..3. */
  double *array2 = malloc(((size_t)size + 4) * sizeof *array2);
  printf("size = %d\n", size);

  int status = EXIT_FAILURE;
  if (array1 && array2) {
    for (int k = 0; k < 4; k++) {
      test(size, array1, array2 + k);
    }
    status = EXIT_SUCCESS;
  } else {
    fprintf(stderr, "allocation failed\n");
  }

  free(array1);
  free(array2);
  return status;
}
使用 -Ofast 编译后,程序可能会输出:
$ ./test 100000
size = 100000
Alignment: ref:16 arr:16 diff:0
Alignment: ref:16 arr:24 diff:7.27596e-12
Alignment: ref:16 arr:0 diff:0
Alignment: ref:16 arr:8 diff:7.27596e-12
问题
是否有一个神奇的标志礼貌地要求编译器不要生成对指针对齐敏感的代码而不完全禁止“快速数学”优化?
更多细节
这是我的 gcc 的配置:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 7.4.0-1ubuntu1~18.04.1' --with-bugurl=file:///usr/share/doc/gcc-7/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr --with-gcc-major-version-only --program-suffix=-7 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie --with-system-zlib --with-target-system-zlib --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)
我只是用“gcc -Ofast”编译了上面的代码。
通过查看生成的汇编代码,编译器似乎正在实现以下伪代码:
/* Two doubles side by side: models the two lanes of one 128-bit register. */
typedef struct {
  double lower;
  double upper;
} packed_double128;

/*
 * Scalar helper: adds a[0..size-1] one element at a time.
 * Models the "unrolled" scalar code; callers below only pass size <= 6.
 */
double sum6(int size, double *a) {
  double r = 0.0;
  for (int k = 0; k < size; k++) {
    r += a[k];
  }
  return r;
}

/*
 * C model of what the compiled sum() does.
 *
 * Short arrays (size <= 6) are summed by the scalar helper. Longer arrays
 * are summed two lanes at a time starting from a 16-byte aligned address;
 * when `a` itself is not 16-byte aligned, a[0] is set aside and added last.
 * That peeling is why the order of the additions depends on the alignment
 * of `a`.
 *
 * Assumes `a` is at least 8-byte aligned, as any double array is.
 */
double sum(int size, double *a) {
  if (size <= 0) {
    return 0.0;
  }
  if (size - 1 <= 5) {
    /* Too short for the two-lane loop: scalar path only. */
    return sum6(size, a);
  }

  /* delta = number of leading elements peeled off to reach alignment. */
  int delta = 0;
  double tmp = 0.0;
  if (((uintptr_t) a) % 16) {
    delta = 1;
    tmp = a[0];
  }

  packed_double128 acc = {0., 0.};
  double *p = a + delta;
  assert (((uintptr_t) p) % 16 == 0); // p is aligned !

  int pairs = (size - delta) / 2;
  for (int k = 0; k < pairs; k++) {
    acc.lower += *p++;
    acc.upper += *p++;
  }

  /* The peeled element is added after the two lanes have been combined. */
  double res = acc.lower + acc.upper + tmp;

  /* Elements left after the peeled one and the pairs: 0 or 1. */
  int remain = (size - delta) - 2 * pairs;
  if (remain) {
    return res + sum6(remain, p);
  }
  return res;
}
例如,如果 a 是数组 {0, 1, 2, ..., 10},则 sum 计算:
((0+2+4+6+8)+(1+3+5+7+9))+10 如果它是“对齐的”
((1+3+5+7+9)+(2+4+6+8+10))+0 如果 a 没有“对齐”(被单独取出的首元素 a[0] = 0 最后才加上)
这是汇编代码(注释是我加的,可能有误):
sum: // double sum (int size, double *a): edi = size, rsi = a, result returned in xmm0
.LFB61:
.cfi_startproc
testl %edi, %edi // size <= 0 ?
jle .L8 // if so, return 0.0
movq %rsi, %rax // rax = a
leal -1(%rdi), %edx // edx = size - 1
shrq $3, %rax // rax = rax >> 3
andl $1, %eax // eax = (a >> 3) & 1, i.e. bit 3 of the address
// now eax contains either 1 (a is not 16-byte aligned) or 0 (it is), let's call this value "delta"
cmpl $5, %edx // size - 1 <= 5 ?
jbe .L9 // if so, only the scalar code at .L3 is used
testl %eax, %eax // a % 16 = 0 ? if so, skip the next two mov (.L10 sets ecx and xmm2 to 0 instead)
je .L10
movsd (%rsi), %xmm2 // xmm2 = a[0] (we put aside the first element)
movl $1, %ecx // ecx = 1 (ecx = delta from here on)
.L4:
movl %edi, %r9d // r9d = size
pxor %xmm1, %xmm1 // xmm1 = 0
subl %eax, %r9d // r9d = size - delta
leaq (%rsi,%rax,8), %rdx // rdx = a + delta * 8 bytes (which is "&a[delta]")
xorl %eax, %eax // eax = 0
movl %r9d, %r8d
shrl %r8d // r8d = (size - delta) / 2
.p2align 4,,10
.p2align 3
.L5: // HERE IS THE "real loop"
addl $1, %eax // eax ++
addpd (%rdx), %xmm1 // xmm1 += *rdx (that's adding 2 doubles into xmm1)
addq $16, %rdx // rdx += 2 * sizeof(double)
cmpl %r8d, %eax // eax < (size - delta) / 2 ?
jb .L5
// Here xmm1 contains two halves of our sum:
// one double is the sum of the odd elements and the other the sum of the even elements
movdqa %xmm1, %xmm0 // xmm0 = xmm1
movl %r9d, %edx // edx = size - delta
andl $-2, %edx // edx = r9d - (r9d % 2)
psrldq $8, %xmm0 // xmm0[0] = xmm0[1]; xmm0[1] = 0.0;
addpd %xmm0, %xmm1 // xmm1 += xmm0 (which now means that xmm1[0] contains the sum of both halves)
cmpl %edx, %r9d // r9d = r9d - (r9d % 2) ? eg. r9d % 2 = 0 ? (tested by the je below)
leal (%rdx,%rcx), %eax // eax = edx + delta = index of the first element not summed yet
movapd %xmm1, %xmm0
addsd %xmm2, %xmm0 // xmm0 = xmm1 + xmm2[0] Here we add the skipped element in case of misalignment
je .L13 // no element left: done
.L3: // BORING UNROLLED LOOP FOR SMALL ARRAYS (size <= 6): adds a[eax], a[eax+1], ... to xmm0 until index >= size
movslq %eax, %rdx
addsd (%rsi,%rdx,8), %xmm0 // xmm0 += a[eax]
leal 1(%rax), %edx // edx = eax + 1
cmpl %edx, %edi // size <= eax + 1 ?
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0 // xmm0 += a[eax + 1]
leal 2(%rax), %edx // edx = eax + 2
cmpl %edx, %edi // size <= eax + 2 ?
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0 // xmm0 += a[eax + 2]
leal 3(%rax), %edx // edx = eax + 3
cmpl %edx, %edi // size <= eax + 3 ?
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0 // xmm0 += a[eax + 3]
leal 4(%rax), %edx // edx = eax + 4
cmpl %edx, %edi // size <= eax + 4 ?
jle .L1
addl $5, %eax // eax += 5
movslq %edx, %rdx
cmpl %eax, %edi // size <= (old eax) + 5 ? (tested by the jle below)
addsd (%rsi,%rdx,8), %xmm0 // xmm0 += a[(old eax) + 4]
jle .L1
cltq // rax = sign-extended eax
addsd (%rsi,%rax,8), %xmm0 // xmm0 += a[(old eax) + 5]
ret
.p2align 4,,10
.p2align 3
.L8: // size <= 0: return 0.0
pxor %xmm0, %xmm0
.L1:
rep ret
.p2align 4,,10
.p2align 3
.L10: // a is 16-byte aligned: nothing is put aside
xorl %ecx, %ecx // ecx = 0 (delta)
pxor %xmm2, %xmm2 // xmm2 = 0.0
jmp .L4
.p2align 4,,10
.p2align 3
.L13:
rep ret
.p2align 4,,10
.p2align 3
.L9: // small array: start the scalar code at index 0 with a zero sum
xorl %eax, %eax // eax = 0
pxor %xmm0, %xmm0 // xmm0 = 0.0
jmp .L3
.cfi_endproc