我正在学习 x64 编程,同时也在了解英特尔 C++ 编译器与 GCC 之间的差异,以及它们各自如何优化指令。
问题:
让英特尔编译器输出汇编代码的最佳方法是什么(类似于 gcc -S)?目前我是在 Visual Studio 中调试并反汇编来查看指令的。
英特尔编译器编译出的 psum1 在反汇编后并没有遵守用寄存器 rdi、rsi、rdx、rcx、r8、r9 传递参数的约定,而 GCC 的汇编输出中可以看到这一约定。我在这里遗漏了什么?
出于某种原因,英特尔编译器不会优化内存访问,我需要更改哪些设置?
//intel compiler /Ox output
p[i] = p[i-1] + a[i];
000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8]
000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4]
000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0
000000013F79119D addss xmm0,xmm1
000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0
//GCC -O3 output
LBB1_3:
decq %rdx
LBB1_2:
addq $4, %rsi
addq $4, %rdi
addss (%rdi), %xmm0
movss %xmm0, (%rsi)
testq %rdx, %rdx
jne LBB1_3
LBB1_4:
原始C代码
/*
 * psum1 - write the running (prefix) sum of a[] into p[].
 *
 * After the call, p[0] == a[0] and p[i] == p[i-1] + a[i] for 1 <= i < n.
 *
 * a: input array of floats
 * p: output array of floats
 * n: number of elements to process
 *
 * a[0] is read and p[0] is written before n is examined, so the caller
 * must pass n >= 1 and arrays holding at least one element.
 */
void psum1( float a[], float p[], long int n ) {
long int i;
/* Seed the running sum with the first element (done unconditionally). */
p[0] = a[0];
for (i=1; i<n; i++) {
/* Each result depends on the one just stored: a loop-carried dependency. */
p[i] = p[i-1] + a[i];
}
}
在 Visual Studio 2010 上从 Intel C++ Compiler 2013 反汇编:
- 全面优化/Ox
- 启用内在函数 /Oi
- 代码速度优先 /Ot
void psum1( float a[], float p[], long int n) {
long int i;
p[0] = a[0];
000000013F791156 movss xmm0,dword ptr [rcx]
000000013F79115A mov dword ptr [rdx],eax
for( i=1; i<n; i++ ) {
000000013F79115C jle psum1+7Ah (13F7911CAh)
000000013F79115E mov eax,1
000000013F791163 lea r10d,[r8-1]
000000013F791167 mov r11d,r10d
000000013F79116A xor r9d,r9d
000000013F79116D shr r11d,1Fh
000000013F791171 lea r8d,[r11+r8-1]
000000013F791176 sar r8d,1
000000013F791179 test r8d,r8d
000000013F79117C jbe psum1+5Eh (13F7911AEh)
p[i] = p[i-1] + a[i];
000000013F79117E lea eax,[r9+r9]
for( i=1; i<n; i++ ) {
000000013F791182 inc r9d
p[i] = p[i-1] + a[i];
000000013F791185 movsxd rax,eax
for( i=1; i<n; i++ ) {
000000013F791188 cmp r9d,r8d
p[i] = p[i-1] + a[i];
000000013F79118B movss xmm1,dword ptr [rcx+rax*4+8]
000000013F791191 addss xmm0,dword ptr [rcx+rax*4+4]
000000013F791197 movss dword ptr [rdx+rax*4+4],xmm0
000000013F79119D addss xmm0,xmm1
000000013F7911A1 movss dword ptr [rdx+rax*4+8],xmm0
for( i=1; i<n; i++ ) {
000000013F7911A7 jb psum1+2Eh (13F79117Eh)
000000013F7911A9 lea eax,[r9+r9+1]
000000013F7911AE lea r8d,[rax-1]
000000013F7911B2 cmp r10d,r8d
000000013F7911B5 jbe psum1+7Ah (13F7911CAh)
p[i] = p[i-1] + a[i];
000000013F7911B7 movsxd rax,eax
000000013F7911BA movss xmm0,dword ptr [rdx+rax*4-4]
000000013F7911C0 addss xmm0,dword ptr [rcx+rax*4]
000000013F7911C5 movss dword ptr [rdx+rax*4],xmm0
}
}
000000013F7911CA ret
000000013F7911CB nop dword ptr [rax+rax]
完全优化(-O3)的 GCC 汇编输出:
# GCC -O3 output for psum1(a, p, n), AT&T syntax.
# As used below: %rdi walks a[], %rsi walks p[], %rdx holds a count derived
# from n, and %xmm0 carries the running sum across iterations.
.section __TEXT,__text,regular,pure_instructions
.globl _psum1
.align 4, 0x90
_psum1:
Leh_func_begin1:
# Frame setup.
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
# p[0] = a[0]; the value stays in %xmm0 as the running sum.
movss (%rdi), %xmm0
movss %xmm0, (%rsi)
# If n < 2 there is nothing left to do.
cmpq $2, %rdx
jl LBB1_4
# %rdx = n - 2: iterations remaining after the first pass through the loop.
addq $-2, %rdx
jmp LBB1_2
.align 4, 0x90
LBB1_3:
# One more iteration consumed.
decq %rdx
LBB1_2:
# Advance both pointers by one float (4 bytes).
addq $4, %rsi
addq $4, %rdi
# sum += *a; only a[i] is loaded, p[i-1] is not re-read from memory.
addss (%rdi), %xmm0
# *p = sum
movss %xmm0, (%rsi)
# Loop again while the remaining count is non-zero.
testq %rdx, %rdx
jne LBB1_3
LBB1_4:
# Frame teardown and return.
popq %rbp
ret
Leh_func_end1: