1

我正在学习 x64 编程以及英特尔 C++ 编译器和 GCC 之间的差异以及它们如何优化指令

问题:

  1. 告诉英特尔编译器转储汇编代码的最佳方法是什么(类似于 gcc -S)?现在我是在 Visual Studio 中调试并反汇编来查看指令的。

  2. 反汇编得到的 Intel 编译版 psum1 没有遵守通过寄存器 rdi、rsi、rdx、rcx、r8、r9 传递参数的约定,而在 GCC 的汇编输出中可以看到这一约定。我在这里漏掉了什么?

  3. 出于某种原因,英特尔编译器不会优化内存访问,我需要更改哪些设置?

                //intel compiler /Ox output
                p[i] = p[i-1] + a[i];
                000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
                000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
                000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
                000000013F79119D  addss       xmm0,xmm1
                000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0
    
    //GCC -O3 output
    LBB1_3:
    decq    %rdx
    LBB1_2:
    addq    $4, %rsi
            addq    $4, %rdi
            addss   (%rdi), %xmm0
            movss   %xmm0, (%rsi)
    testq   %rdx, %rdx
            jne LBB1_3
            LBB1_4:
    

原始C代码

/*
 * psum1 - running (prefix) sum of a[] written into p[].
 *
 * Stores p[0] = a[0], then p[i] = p[i-1] + a[i] for every i in [1, n).
 *
 * a: input array; a[0] is read unconditionally.
 * p: output array; p[0] is written unconditionally.
 * n: upper bound (exclusive) of the index range processed by the loop.
 *
 * NOTE(review): a[0] and p[0] are accessed before n is examined, so this
 * presumably expects n >= 1 -- confirm against callers.
 */
void psum1( float a[], float p[], long int n ) {
    long int i;
    p[0] = a[0];              /* seed element, copied without any check on n */
    for (i=1; i<n; i++) {
        p[i] = p[i-1] + a[i]; /* each output depends on the previous output */
    }
}

在 Visual Studio 2010 上从 Intel C++ Compiler 2013 反汇编:

  • 全面优化/Ox
  • 启用内在函数 /Oi
  • 速度/Ot

    void psum1( float a[], float p[], long int n) { long int i;

    p[0] = a[0];
    000000013F791156  movss       xmm0,dword ptr [rcx]
    000000013F79115A  mov         dword ptr [rdx],eax
    
    for( i=1; i<n; i++ ) {
        000000013F79115C  jle         psum1+7Ah (13F7911CAh)
        000000013F79115E  mov         eax,1
        000000013F791163  lea         r10d,[r8-1]
        000000013F791167  mov         r11d,r10d
        000000013F79116A  xor         r9d,r9d
        000000013F79116D  shr         r11d,1Fh
        000000013F791171  lea         r8d,[r11+r8-1]
        000000013F791176  sar         r8d,1
        000000013F791179  test        r8d,r8d
        000000013F79117C  jbe         psum1+5Eh (13F7911AEh)
    
        p[i] = p[i-1] + a[i];
        000000013F79117E  lea         eax,[r9+r9]
    
        for( i=1; i<n; i++ ) {
            000000013F791182  inc         r9d
    
            p[i] = p[i-1] + a[i];
            000000013F791185  movsxd      rax,eax
    
            for( i=1; i<n; i++ ) {
                000000013F791188  cmp         r9d,r8d
    
                p[i] = p[i-1] + a[i];
                000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
                000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
                000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
                000000013F79119D  addss       xmm0,xmm1
                000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0
    
                for( i=1; i<n; i++ ) {
                    000000013F7911A7  jb          psum1+2Eh (13F79117Eh)
                    000000013F7911A9  lea         eax,[r9+r9+1]
                    000000013F7911AE  lea         r8d,[rax-1]
                    000000013F7911B2  cmp         r10d,r8d
                    000000013F7911B5  jbe         psum1+7Ah (13F7911CAh)
    
                    p[i] = p[i-1] + a[i];
                    000000013F7911B7  movsxd      rax,eax
                    000000013F7911BA  movss       xmm0,dword ptr [rdx+rax*4-4]
                    000000013F7911C0  addss       xmm0,dword ptr [rcx+rax*4]
                    000000013F7911C5  movss       dword ptr [rdx+rax*4],xmm0
                }
            }
            000000013F7911CA  ret
            000000013F7911CB  nop         dword ptr [rax+rax]
    

GCC 完全优化(-O3)的汇编输出

# Compiler-generated listing for psum1 (AT&T syntax: op src, dst).
# Register use as seen below, matching the C call psum1(a, p, n):
#   %rdi  = read cursor, starts at a
#   %rsi  = write cursor, starts at p
#   %rdx  = n on entry, then a count of loop iterations still to run
#   %xmm0 = running sum (the most recently stored p[] value)
.section    __TEXT,__text,regular,pure_instructions
.globl  _psum1
.align  4, 0x90
_psum1:
Leh_func_begin1:
# Frame pointer is set up here but %rbp is not otherwise referenced below.
pushq   %rbp
        Ltmp0:
movq    %rsp, %rbp
        Ltmp1:
# p[0] = a[0]; the value stays in %xmm0 as the running sum.
movss   (%rdi), %xmm0
        movss   %xmm0, (%rsi)
# n < 2 (signed compare): the loop body would never run, go to the exit.
cmpq    $2, %rdx
        jl  LBB1_4
# %rdx = n - 2: iterations remaining after the first one.
        addq    $-2, %rdx
        jmp LBB1_2
.align  4, 0x90
# Back-edge target: consume one remaining iteration, then fall into the body.
LBB1_3:
decq    %rdx
# Loop body: advance both cursors by one float (4 bytes), add the next
# input element to the running sum, and store the sum to the output.
# The previous output is never reloaded from memory; it lives in %xmm0.
LBB1_2:
addq    $4, %rsi
        addq    $4, %rdi
        addss   (%rdi), %xmm0
        movss   %xmm0, (%rsi)
# Loop again while iterations remain (%rdx != 0).
testq   %rdx, %rdx
        jne LBB1_3
# Common exit: restore the caller's %rbp and return.
LBB1_4:
popq    %rbp
        ret
Leh_func_end1:
4

0 回答 0