我正在尝试使用词法分析器,我发现在程序的一部分中从 while 循环切换到 if 语句和 do-while 循环导致代码速度提高了约 20%,这看起来很疯狂。我将编译器生成的代码中的差异与这些程序集片段隔离开来。有谁知道为什么快速代码更快?
在程序集中,'edi' 是当前文本位置,'ebx' 是文本结尾,'isAlpha' 是一个查找表,如果字符是字母则为 1,否则为 0。
慢代码:
slow_loop:
00401897 cmp edi,ebx
00401899 je slow_done (4018AAh)
0040189B movzx eax,byte ptr [edi]
0040189E cmp byte ptr isAlpha (4533E0h)[eax],0
004018A5 je slow_done (4018AAh)
004018A7 inc edi
004018A8 jmp slow_loop (401897h)
slow_done:
快速代码:
fast_loop:
0040193D inc edi
0040193E cmp edi,ebx
00401940 je fast_done (40194Eh)
00401942 movzx eax,byte ptr [edi]
00401945 cmp byte ptr isAlpha (4533E0h)[eax],0
0040194C jne fast_loop (40193Dh)
fast_done:
如果我只针对仅包含字母“a”的兆字节文本运行这些程序集片段,那么快速代码将快 30%。我的猜测是由于分支预测错误,慢代码很慢,但我认为在循环中这将是一次性成本。
这是我用来测试两个片段的程序:
#include <Windows.h>
#include <string>
#include <iostream>
int main( int argc, char* argv[] )
{
static char isAlpha[256];
for ( int i = 0; i < sizeof( isAlpha ); ++i )
isAlpha[i] = isalpha( i ) ? 1 : 0;
std::string test( 1024*1024, 'a' );
const char* start = test.c_str();
const char* limit = test.c_str() + test.size();
DWORD slowStart = GetTickCount();
for ( int i = 0; i < 10000; ++i )
{
__asm
{
mov edi, start
mov ebx, limit
inc edi
slow_loop:
cmp edi,ebx
je slow_done
movzx eax,byte ptr [edi]
cmp byte ptr isAlpha [eax],0
je slow_done
inc edi
jmp slow_loop
slow_done:
}
}
DWORD slowEnd = GetTickCount();
std::cout << "slow in " << ( slowEnd - slowStart ) << " ticks" << std::endl;
DWORD fastStart = GetTickCount();
for ( int i = 0; i < 10000; ++i )
{
__asm
{
mov edi, start
mov ebx, limit
fast_loop:
inc edi
cmp edi,ebx
je fast_done
movzx eax,byte ptr [edi]
cmp byte ptr isAlpha [eax],0
jne fast_loop
fast_done:
}
}
DWORD fastEnd = GetTickCount();
std::cout << "fast in " << ( fastEnd - fastStart ) << " ticks" << std::endl;
return 0;
}
测试程序的输出是
slow in 8455 ticks
fast in 5694 ticks