optimization - GNU 内联汇编优化

Question

我正在尝试编写一个小型库，用于高度优化的 x86-64 位操作（bit operation）代码，并且正在尝试使用内联汇编。

在测试这个特殊案例时引起了我的注意：

unsigned long test = 0;
unsigned long bsr;

// Bit test and set: set bit 39 of test. "+rm" makes test a read-write
// operand that may live in a register or in memory; "rJ" allows the bit
// index to be a register or an immediate constant.
__asm__ ("btsq\t%1, %0 " : "+rm" (test) : "rJ" (39) );

// Bit scan reverse: store the index of the most significant set bit of
// test in bsr. test is a plain input here (register or memory).
__asm__ ("bsrq\t%1, %0" : "=r" (bsr) : "rm" (test) );

// Both values are unsigned long, so both need %lu (%d would be a
// format/argument type mismatch, which is undefined behaviour).
printf("test = %lu, bsr = %lu\n", test, bsr);

用 gcc 和 icc 编译和运行都没有问题，但是当我检查生成的汇编代码时，发现了差异：

gcc -S -fverbose-asm -std=gnu99 -O3

movq    $0, -8(%rbp)
## InlineAsm Start
btsq    $39, -8(%rbp) 
## InlineAsm End
movq    -8(%rbp), %rax
movq    %rax, -16(%rbp)
## InlineAsm Start
bsrq    -16(%rbp), %rdx
## InlineAsm End
movq    -8(%rbp), %rsi
leaq    L_.str(%rip), %rdi
xorb    %al, %al
callq   _printf

我想知道为什么这么复杂？我正在编写高性能代码，其中指令的数量至关重要。我特别想知道为什么 gcc 在将我的变量test传递给第二个内联 asm 之前会对其进行复制？

使用 icc 编译的相同代码给出了更好的结果：

    xorl      %esi, %esi                                    # test = 0
    movl      $.L_2__STRING.0, %edi                         # has something to do with printf
    orl       $32832, (%rsp)                                # part of function initiation
    xorl      %eax, %eax                                    # has something to do with printf
    ldmxcsr   (%rsp)                                        # part of function initiation
    btsq      $39, %rsi                                     #106.0
    bsrq      %rsi, %rdx                                    #109.0
    call      printf                                        #111.2

尽管 gcc 决定将我的变量保留在堆栈中而不是寄存器中，但我不明白的是：为什么要在将 test 传递给第二个 asm 之前对其进行复制？如果我在第二个 asm 中将 test 作为输入/输出变量传入：

// Variant of the bit-scan-reverse statement: test is listed as a
// read-write ("+rm") output operand instead of a plain input.
__asm__ ("bsrq\t%1, %0" : "=r" (bsr) , "+rm" (test) );

那么那些多余的（复制用的）指令行就消失了：

movq    $0, -8(%rbp)
## InlineAsm Start
btsq    $39, -8(%rbp) 
## InlineAsm End
## InlineAsm Start
bsrq    -8(%rbp), %rdx
## InlineAsm End
movq    -8(%rbp), %rsi
leaq    L_.str(%rip), %rdi
xorb    %al, %al
callq   _printf

这是 gcc 搞砸了优化还是我错过了一些重要的编译器开关？我的生产系统确实有 icc，但是如果我决定在某个时候分发源代码，那么它也必须能够使用 gcc 进行编译。

使用的编译器：

gcc 版本 4.2.1（基于 Apple Inc. build 5658）（LLVM build 2336.1.00）

icc 版本 12.0.2

score 4 · Accepted Answer

我已经在 Linux 上像下面这样尝试了你的示例（通过在 printf() 中使用 &test，强制 test 拥有栈上的引用/位置，从而故意让情况变得更“邪恶”）：

#include <stdio.h>
/*
 * Demo program: sets bit 39 of `test` with BTS, finds the index of its most
 * significant set bit with BSR, and prints both values together with the
 * address of `test`. Taking &test in the printf call forces `test` to have
 * a stack location.
 *
 * x86-64 only (uses GNU extended inline assembly). Returns 0.
 */
int main(int argc, char **argv)
{
    unsigned long test = 0;
    unsigned long bsr;

    /* Bit test and set bit 39. "+rm": test is read and written and may be
     * in a register or in memory. __asm__ is used instead of asm so the code
     * also compiles under strict ISO modes such as -std=c99 / -std=c11. */
    __asm__ ("btsq\t%1, %0 " : "+rm" (test) : "rJ" (39) );

    /* Bit scan reverse: index of the most significant set bit of test. */
    __asm__ ("bsrq\t%1, %0" : "=r" (bsr) : "rm" (test) );

    /* bsr is unsigned long, so it needs %lu; %p requires a void * argument. */
    printf("test = %lu, bsr = %lu, &test = %p\n", test, bsr, (void *)&test);
    return 0;
}

并用各种版本的gcc -O3... 编译它，得到以下结果：

生成的代码                                          gcc 版本
==================================================== ===============================
  400630: 48 83 ec 18 sub $0x18,%rsp                4.7.2,
  400634: 31 c0 xor %eax,%eax                       4.6.2,
  400636: bf 50 07 40 00 mov $0x400750,%edi         4.4.6
  40063b: 48 8d 4c 24 08 lea 0x8(%rsp),%rcx
  400640: 48 0f ba e8 27 bts $0x27,%rax
  400645: 48 89 44 24 08 mov %rax,0x8(%rsp)
  40064a: 48 89 c6 mov %rax,%rsi
  40064d: 48 0f bd d0 bsr %rax,%rdx
  400651: 31 c0 xor %eax,%eax
  400653: e8 68 fe ff ff callq 4004c0
[ ... ]
-------------------------------------------------- -------------------------------------------
  4004f0: 48 83 ec 18 sub $0x18,%rsp                4.1
  4004f4: 31 c0 xor %eax,%eax
  4004f6: bf 28 06 40 00 mov $0x400628,%edi
  4004fb: 48 8d 4c 24 10 lea 0x10(%rsp),%rcx
  400500: 48 c7 44 24 10 00 00 00 00 movq $0x0,0x10(%rsp)
  400509: 48 0f ba e8 27 bts $0x27,%rax
  40050e: 48 89 44 24 10 mov %rax,0x10(%rsp)
  400513: 48 89 c6 mov %rax,%rsi
  400516: 48 0f bd d0 bsr %rax,%rdx
  40051a: 31 c0 xor %eax,%eax
  40051c: e8 c7 fe ff ff callq 4003e8
[ ... ]
-------------------------------------------------- -------------------------------------------
  400500: 48 83 ec 08 sub $0x8,%rsp                 3.4.5
  400504: bf 30 06 40 00 mov $0x400630,%edi
  400509: 31 c0 xor %eax,%eax
  40050b: 48 c7 04 24 00 00 00 00 movq $0x0,(%rsp)
  400513: 48 89 e1 mov %rsp,%rcx
  400516: 48 0f ba 2c 24 27 btsq $0x27,(%rsp)
  40051c: 48 8b 34 24 mov (%rsp),%rsi
  400520: 48 0f bd 14 24 bsr (%rsp),%rdx
  400525: e8 fe fe ff ff callq 400428
[ ... ]
-------------------------------------------------- -------------------------------------------
  4004e0: 48 83 ec 08 sub $0x8,%rsp                 3.2.3
  4004e4: bf 10 06 40 00 mov $0x400610,%edi
  4004e9: 31 c0 xor %eax,%eax
  4004eb: 48 c7 04 24 00 00 00 00 movq $0x0,(%rsp)
  4004f3: 48 0f ba 2c 24 27 btsq $0x27,(%rsp)
  4004f9: 48 8b 34 24 mov (%rsp),%rsi
  4004fd: 48 89 e1 mov %rsp,%rcx
  400500: 48 0f bd 14 24 bsr (%rsp),%rdx
  400505: e8 ee fe ff ff callq 4003f8
[ ... ]

尽管生成的代码存在显著差异（包括 bsr 是以寄存器还是以内存的方式访问 test），但我测试过的版本中没有一个能重现您所展示的汇编代码。我怀疑您在 MacOSX 上使用的 4.2.x 版本中存在错误，但是我既没有您的测试用例，也没有可用的那个特定编译器版本。

编辑：上面的代码强制 test 进入堆栈，在这个意义上显然与您的代码不同；如果不这样做，那么我测试过的所有“普通” gcc 版本都会直接生成 bts $39, %rsi / bsr %rsi, %rdx 这一对指令。

但是，我发现 clang 在这种情况下生成了不同的代码：

 140: 50 push %rax
 141: 48 c7 04 24 00 00 00 00 movq $0x0,(%rsp)
 149: 31 f6 xor %esi,%esi
 14b: 48 0f ba ee 27 bts $0x27,%rsi
 150: 48 89 34 24 mov %rsi,(%rsp)
 154: 48 0f bd d6 bsr %rsi,%rdx
 158: bf 00 00 00 00 mov $0x0,%edi
 15d: 30 c0 xor %al,%al
 15f: e8 00 00 00 00 callq <printf@plt>

所以差异似乎确实存在于 clang/llvm 的代码生成器与“正统的 gcc”（gcc proper）之间。

optimization - GNU 内联汇编优化

1 回答 1

Related

Reference