Godbolt 链接:https ://godbolt.org/g/Hv6MAL
typedef int cell;
cell y;
const cell *phys_addr = (const cell*)0x12340;
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
y = subsubarray[k];
}
}
}
}
期望编译器将上述代码优化为类似于以下内容的感觉是很自然的:
int main() {
for (int i = 0; i < 20; i++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
for (int j = 0; j < 30; j++) {
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
for (int k = 0; k < 50; k++) {
y = subsubarray[k];
}
}
}
}
但是由 gcc 8.2 生成的带有-O3 -m32
as 标志的程序集是:
push ebp
push edi
push esi
push ebx
sub esp, 8
mov eax, DWORD PTR phys_addr
mov DWORD PTR [esp], 0
mov DWORD PTR [esp+4], eax
mov ebp, eax
.L4:
xor esi, esi
.L3:
lea edi, [0+esi*4]
xor eax, eax
.L2:
mov edx, DWORD PTR [ebp+0]
mov ecx, DWORD PTR [esp+4]
shr edx, 2
add edx, DWORD PTR [esp]
lea ebx, [ecx+edx*4]
lea edx, [eax+esi]
add eax, 1
mov ecx, DWORD PTR [ebx+edi]
shr ecx, 2
add edx, ecx
mov edx, DWORD PTR [ebx+edx*4]
mov DWORD PTR y, edx
cmp eax, 50
jne .L2
add esi, 1
cmp esi, 30
jne .L3
add DWORD PTR [esp], 1
mov eax, DWORD PTR [esp]
add ebp, 4
cmp eax, 20
jne .L4
add esp, 8
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
为什么编译器不将subarray
andsubsubarray
计算移到内部循环之外?
随机volatile
会变魔术
我随机添加volatile
以防止 DCE 摆脱所有代码,然后以某种方式将循环不变量从内部循环中提升出来。
int main() {
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 30; j++) {
for (int k = 0; k < 50; k++) {
const cell *subarray = (&phys_addr[i] + phys_addr[i]/sizeof(cell));
const cell *subsubarray = (&subarray[j] + subarray[j]/sizeof(cell));
volatile cell y = subsubarray[k];
}
}
}
return 0;
}
这主要不是因为y
它是一个局部变量,因为 usingstd::cout << subsubarray[k];
阻止了优化。
gcc 8.2-O3 -m32
为上述代码生成的带有 as 标志的程序集是:
main:
push ebp
push edi
xor edi, edi
push esi
push ebx
sub esp, 20
mov ebp, DWORD PTR phys_addr
.L4:
mov eax, DWORD PTR [ebp+0+edi*4]
xor ecx, ecx
shr eax, 2
add eax, edi
lea ebx, [ebp+0+eax*4]
lea esi, [ebx+200]
.L3:
mov edx, DWORD PTR [ebx+ecx*4]
mov DWORD PTR [esp], ecx
shr edx, 2
add edx, ecx
sal edx, 2
lea eax, [ebx+edx]
add edx, esi
.L2:
mov ecx, DWORD PTR [eax]
add eax, 4
mov DWORD PTR [esp+16], ecx
cmp edx, eax
jne .L2
mov ecx, DWORD PTR [esp]
add ecx, 1
cmp ecx, 30
jne .L3
add edi, 1
cmp edi, 20
jne .L4
add esp, 20
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret
循环不变量被推出内部循环。randomvolatile
做了什么来让 GCC 优化不变量?clang 6.0.0时不会发生优化。