4

gcc 中的-ffast-math选项允许编译器重新排序浮点运算以更快地执行。

这可能会导致这些操作的结果之间存在细微差异,具体取决于指针的对齐方式。例如,在 x64 上,一些优化指令 (AVX) 在 128 位对齐的指针上更快,因此允许编译器这样做是有道理的。

这是一个简单程序的示例,它显示了现代 CPU 上的这种行为:

#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>


double sum(int size, double *a) {
        double r = 0.0;
        for (int k = 0;k < size; k ++) {
          r+= a[k];
        }
        return r;
}

void init(int size, double *a) {
  srand(0);
  for (int k = 0; k < size; k ++) {
          a[k] = ((double) rand()) / RAND_MAX;
  }
}


void test(int size, double *ref, double *arr) {
  init (size, ref);
  memcpy (arr, ref, sizeof *arr * size);

  printf("Alignment: ref:%d  arr:%d diff:%g\n",
    (int)((uintptr_t)ref % 32),
    (int)((uintptr_t)arr % 32),
    fabs(sum(size, arr) - sum(size, ref)));
}


int main(int argc, char **argv) {
  int size = argc <= 1 ? 100 : atoi(argv[1]); // (don't do that at home)
  double *array1 = malloc(size * sizeof *array1);
  double *array2 = malloc((size + 4) * sizeof *array2);
  printf("size = %d\n", size);
  if (array1 && array2) {
    for (int k = 0;k < 4;k ++){
      test(size, array1, array2 + k);
    }
  }
}

编译时可能会输出-Ofast

$ ./test 100000
size = 100000
Alignment: ref:16  arr:16 diff:0
Alignment: ref:16  arr:24 diff:7.27596e-12
Alignment: ref:16  arr:0 diff:0
Alignment: ref:16  arr:8 diff:7.27596e-12

问题

是否有一个神奇的标志礼貌地要求编译器不要生成对指针对齐敏感的代码而不完全禁止“快速数学”优化?

更多细节

这是我的 gcc 的配置:

Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 7.4.0-1ubuntu1~18.04.1' --with-bugurl=file:///usr/share/doc/gcc-7/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr --with-gcc-major-version-only --program-suffix=-7 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie --with-system-zlib --with-target-system-zlib --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)

我只是用“gcc -Ofast”编译了上面的代码。

通过查看生成的汇编代码,编译器似乎正在实现以下伪代码:

typedef struct {
    double lower;
    double upper; 
} packed_double128; 

double sum6(int size, double *a) { ... an array of size smaller than 6 and at least one in an "unrolled" fashion ... }

double sum(int size, double *a) {
    if (size == 0) {
        return 0.0;
    } else if (size - 1 <= 5) {
        int delta;
        double tmp; 
        if (((intptr_t) a) % 16) {
           delta = 1; 
           tmp = a[0];
        } else {
            delta = 0; 
            tmp = 0.0;
        }        
        packed_double128 res = {0., 0.}; 
        double *p = a + delta; 
        assert (((intptr_t) p) % 16 == 0); // p is aligned !
        for (k = 0; k < (size - delta) / 2; k ++) {
            res.lower += *p++;
            res.upper += *p++;            
        }
        double res = res.lower + res.upper + tmp; 
        int remain = size - 2*((size - delta) / 2); 
        if (remain) 
          return res + sum6(remain, p); 
        else 
          return res; 
    } else {
        return sum6(size, a); 
    }
}

例如,如果 a 是数组 {0, 1, 2,.., 10},则sum计算:

  • ((0+2+4+6+8)+(1+3+5+7+9))+10 如果它是“对齐的”

  • ((2+4+6+8+10)+(1+3+5+7+9))+1 如果 a 没有“对齐”

这是汇编代码(评论是我的,可能是错误的):

sum: // double sum (int size, double *a)
.LFB61:
    .cfi_startproc
    testl   %edi, %edi       // size = 0 ?
    jle .L8 ;
    movq    %rsi, %rax  ;    // rax = a
    leal    -1(%rdi), %edx ; // edx = size - 1
    shrq    $3, %rax         // rax = rax >> 3
    andl    $1, %eax         // rax = rax % 2
    // now eax contains either 1 or 0 depending the alignment, let's call this value "delta"
    cmpl    $5, %edx         // size - 1 <= 5 ?
    jbe .L9
    testl   %eax, %eax       // a % 16 = 0 ? if so, skip the next two mov (and initialize xmm2 to 0)
    je  .L10
    movsd   (%rsi), %xmm2    // xmm2 = a[0] (we put aside the first element)
    movl    $1, %ecx         // ecx = 1
.L4:
    movl    %edi, %r9d          // r9d = size
    pxor    %xmm1, %xmm1        // xmmm1 = 0
    subl    %eax, %r9d          // r9d = size - delta
    leaq    (%rsi,%rax,8), %rdx // rdx = a + eax * 8 (which is "&a[rax + delta]")
    xorl    %eax, %eax          // eax = 0
    movl    %r9d, %r8d
    shrl    %r8d                // r8d = (size - delta) / 2
    .p2align 4,,10
    .p2align 3
.L5: // HERE IS THE "real loop"
    addl    $1, %eax            // eax ++
    addpd   (%rdx), %xmm1       // xmm1 += *rdx (thas's adding 2 doubles into xmm1)
    addq    $16, %rdx           // rdx += 2 * sizeof(double)
    cmpl    %r8d, %eax          // eax < (size - delta) / 2 ?
    jb  .L5
    // Here xmm1 contains two halves of our sum:
    // one double is the sum of odds elements and the other the sums of even elements
    movdqa  %xmm1, %xmm0        // xmm0 = xmm1
    movl    %r9d, %edx          // edx = size - delta
    andl    $-2, %edx           // edx = r9d - (r9d % 2)
    psrldq  $8, %xmm0           // xmm0[0] = xmm0[1]; xmm0[1] = 0.0;
    addpd   %xmm0, %xmm1        // xmm1 += xmm0 (which now means that xmm1[0] contains the final result \o/ )
    cmpl    %edx, %r9d          // r9d = r9d - (r9d % 2) ? eg. r9d % 2 = 0 ?
    leal    (%rdx,%rcx), %eax   // eax = rdx + 1
    movapd  %xmm1, %xmm0
    addsd   %xmm2, %xmm0        // xmm0 = xmm1 + xmm2[0] Here we add the skipped element in case of misalignment
    je  .L13
.L3: // BORING UNROLLED LOOP FOR SMALL ARRAYS (size <= 6)
    movslq  %eax, %rdx
    addsd   (%rsi,%rdx,8), %xmm0
    leal    1(%rax), %edx
    cmpl    %edx, %edi
    jle .L1
    movslq  %edx, %rdx
    addsd   (%rsi,%rdx,8), %xmm0
    test    2(%rax), %edx
    cmpl    %edx, %edi
    jle .L1
    movslq  %edx, %rdx
    addsd   (%rsi,%rdx,8), %xmm0
    leal    3(%rax), %edx
    cmpl    %edx, %edi
    jle .L1
    movslq  %edx, %rdx
    addsd   (%rsi,%rdx,8), %xmm0
    leal    4(%rax), %edx
    cmpl    %edx, %edi
    jle .L1
    addl    $5, %eax
    movslq  %edx, %rdx
    cmpl    %eax, %edi
    addsd   (%rsi,%rdx,8), %xmm0
    jle .L1
    cltq
    addsd   (%rsi,%rax,8), %xmm0
    ret
    .p2align 4,,10
    .p2align 3
.L8:
    pxor    %xmm0, %xmm0
.L1:
    rep ret
    .p2align 4,,10
    .p2align 3
.L10:
    xorl    %ecx, %ecx
    pxor    %xmm2, %xmm2
    jmp .L4
    .p2align 4,,10
    .p2align 3
.L13:
    rep ret
    .p2align 4,,10
    .p2align 3
.L9:
    xorl    %eax, %eax
    pxor    %xmm0, %xmm0
    jmp .L3
    .cfi_endproc
4

0 回答 0