c++ - 为什么自定义循环更快？编译器不好？不安全的自定义代码？运气？（幸运的缓存命中）

Question

我刚刚开始学习汇编并制作一些自定义循环，以使用 C++ 的 asm{} 主体与 C-Free 5.0 中的 Digital-Mars 编译器交换两个变量

启用 -o（优化）

并得到了结果：

 time of for-loop(cycles)        844
 time of while-loop(cycles)      735
 time of custom-loop-1(cycles)   562
 time of custom-loop-2(cycles)   469

我找不到 Digital-Mars 编译器“asm 输出”选项进行比较。构建选项中没有其他优化选项。我应该改变我的编译器吗？如果是，是哪一个？你能看看下面的代码并告诉我为什么自定义循环更快吗？

这是标准的 for 循环：

// Baseline: time 200,000,000 swaps of a and b through temp with a counted for loop.
t1=clock(); // clock() reading before the loop
for(int i=0;i<200000000;i++)
{
    temp=a;// statement 1: save a
    a=b;// statement 2: a takes b's value
    b=temp;// statement 3: b takes the saved a (three C statements, not three machine instructions)
}   
t2=clock(); // clock() reading after the loop
// The printed value is the difference of two clock() readings, i.e. clock ticks
// (CLOCKS_PER_SEC per second), not CPU cycles.
// NOTE(review): %i expects an int; confirm the type of t1/t2 where they are declared.
printf("\n time of for-loop(increasing) %i  \n",(t2-t1));

这是标准的while循环：

// Same swap benchmark as the for loop, written as a while loop over j.
// NOTE(review): j is not initialised in this snippet; the loop only runs
// 200,000,000 times if j is 0 on entry -- confirm at the declaration.
t1=clock(); // clock() reading before the loop
while(j<200000000)
{
    temp=a;// save a
    a=b;// a takes b's value
    b=temp;// b takes the saved a
            j++;// advance the loop counter
}
t2=clock(); // clock() reading after the loop
// Prints the elapsed clock() ticks (not CPU cycles, despite the label).
printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));

这是我的自定义循环 1：

// Hand-written loop 1: swaps a and b through eax/ebx, reloading and storing
// both variables on every iteration.
t1=clock();
j=200000000;//iteration limit, loaded into edx below
    __asm
    {
        // Save the flags and every register the loop touches so the
        // surrounding compiled code sees them unchanged afterwards.
        pushf           //save flags
        push eax        //save eax
        push ebx        //save ebx
        push ecx        //save ecx
        push edx        //save edx

        mov ecx,0       //ecx = loop counter, counts up from 0
        mov edx,j       //edx = loop limit (200000000)

        do_it_again:    //top of the loop


        mov eax,a       //eax = a (load)
        mov ebx,b       //ebx = b (load)
        mov b,eax       //b = old a (store)
        mov a,ebx       //a = old b (store); two loads and two stores per iteration

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again  //jb is the unsigned "below" test; loop until counter reaches limit

        pop edx     //restore in reverse push order
        pop ecx         
        pop ebx         
        pop eax         
        popf            
    }

t2=clock();
// Prints the elapsed clock() ticks (not CPU cycles, despite the label).
printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));

这是我的第二个自定义循环：

// Hand-written loop 2: swaps two register copies of a and b using only
// add/sub/neg (no temporary register), then writes the result back.
//
// Fixes over the original version:
//  * the last step used `xor eax,80000000h`, which only flips the sign bit and
//    is not a two's-complement negation (with a=b=0 it produced 0x80000000);
//    `neg eax` is the correct way to turn -b into b;
//  * the swapped values were discarded by the pops; they are now stored to
//    a and b after the loop, before the saved registers are restored.
t1=clock();
j=200000000;//iteration limit, loaded into edx below
    __asm
    {
        pushf           //save flags and the registers used below
        push eax
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = loop counter, counts up from 0
        mov edx,j       //edx = loop limit (200000000)

        mov eax,a       //eax = a, ebx = b: the loop itself never touches memory
        mov ebx,b

        do_it_again2:   //top of the loop

        sub eax,ebx     //eax = a-b
        add ebx,eax     //ebx = b+(a-b) = a
        sub eax,ebx     //eax = (a-b)-a = -b
        neg eax         //eax = b; the swap is complete

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again2 //unsigned "below": loop until counter reaches limit

        mov a,eax       //publish the swapped values before eax/ebx are restored
        mov b,ebx

        pop edx         //restore in reverse push order
        pop ecx
        pop ebx
        pop eax
        popf
    }

t2=clock();
// Prints the elapsed clock() ticks (not CPU cycles, despite the label).
printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

完整代码：

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

// Benchmark driver: times four ways of swapping a and b 200,000,000 times and
// prints each elapsed time in clock() ticks (CLOCKS_PER_SEC per second; the
// "(cycles)" in the labels is kept for compatibility, these are not CPU cycles).
// Needs a compiler with MSVC / Digital Mars style __asm blocks targeting 32-bit x86.
//
// Fixes over the original version:
//  * clock() readings are kept in clock_t (they were stored in time_t) and the
//    difference is cast to int to match the %i conversion;
//  * custom loop 2 ended with `xor eax,80000000h`, which flips the sign bit
//    instead of negating; it now uses `neg eax` and stores the swapped values
//    back to a and b instead of discarding them.
int main()
{
    int j=0;
    int a=0,b=0,temp=0;

    srand(time(0));
    clock_t t1=0;
    clock_t t2=0;

    // 1) plain C++ for loop
    t1=clock();
    for(int i=0;i<200000000;i++)
    {
        temp=a;// save a
        a=b;// a takes b's value
        b=temp;// b takes the saved a
    }
    t2=clock();
    printf("\n time of for-loop(cycles) %i  \n",(int)(t2-t1));

    // 2) plain C++ while loop (j starts at 0, see its declaration above)
    t1=clock();
    while(j<200000000)
    {
        temp=a;
        a=b;
        b=temp;
        j++;
    }
    t2=clock();
    printf("\n time of while-loop(cycles)  %i  \n",(int)(t2-t1));

    // 3) hand-written loop swapping through memory on every iteration
    t1=clock();
    j=200000000;//iteration limit, loaded into edx below
    __asm
    {
        pushf           //save flags and the registers used below
        push eax
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = loop counter, counts up from 0
        mov edx,j       //edx = loop limit

        do_it_again:    //top of the loop

        mov eax,a       //eax = a (load)
        mov ebx,b       //ebx = b (load)
        mov b,eax       //b = old a (store)
        mov a,ebx       //a = old b (store)

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again  //unsigned "below": loop until counter reaches limit

        pop edx         //restore in reverse push order
        pop ecx
        pop ebx
        pop eax
        popf
    }
    t2=clock();
    printf("\n time of custom-loop-1(cycles)   %i   \n",(int)(t2-t1));

    // 4) hand-written loop swapping two registers with add/sub/neg
    t1=clock();
    j=200000000;//iteration limit, loaded into edx below
    __asm
    {
        pushf           //save flags and the registers used below
        push eax
        push ebx
        push ecx
        push edx

        mov ecx,0       //ecx = loop counter, counts up from 0
        mov edx,j       //edx = loop limit

        mov eax,a       //eax = a, ebx = b: the loop itself never touches memory
        mov ebx,b

        do_it_again2:   //top of the loop

        sub eax,ebx     //eax = a-b
        add ebx,eax     //ebx = b+(a-b) = a
        sub eax,ebx     //eax = (a-b)-a = -b
        neg eax         //eax = b; the swap is complete

        inc ecx         //counter++
        cmp ecx,edx     //counter < limit ?
        jb do_it_again2 //unsigned "below": loop until counter reaches limit

        mov a,eax       //publish the swapped values before eax/ebx are restored
        mov b,ebx

        pop edx         //restore in reverse push order
        pop ecx
        pop ebx
        pop eax
        popf
    }
    t2=clock();
    printf("\n time of custom-loop-2(cycles)  %i   \n",(int)(t2-t1));

    return 0;
}

我只是在学习 c++ 和汇编，想知道事情是怎么回事。谢谢

windows xp, pentium 4 (2 GHz) Digital-Mars in C-Free

score 6 · Accepted Answer

该编译器生成的代码非常糟糕。用 objconv 反汇编目标文件后，下面是我得到的第一个 for 循环的代码。

; objconv disassembly of the compiled for loop: every variable lives in a stack slot.
; Matching the loads/stores against the C source (temp=a; a=b; b=temp;):
;   [ebp-4H]  = loop index i      [ebp-18H] = a
;   [ebp-14H] = b                 [ebp-10H] = temp
; Loop test: leave once i >= 200000000 (compares memory against an immediate).
?_001:  cmp     dword [ebp-4H], 200000000               ; 0053 _ 81. 7D, FC, 0BEBC200
        jge     ?_002                                   ; 005A _ 7D, 17
; i++ is done directly on the stack slot.
        inc     dword [ebp-4H]                          ; 005C _ FF. 45, FC
; temp = a
        mov     eax, dword [ebp-18H]                    ; 005F _ 8B. 45, E8
        mov     dword [ebp-10H], eax                    ; 0062 _ 89. 45, F0
; a = b
        mov     eax, dword [ebp-14H]                    ; 0065 _ 8B. 45, EC
        mov     dword [ebp-18H], eax                    ; 0068 _ 89. 45, E8
; b = temp
        mov     eax, dword [ebp-10H]                    ; 006B _ 8B. 45, F0
        mov     dword [ebp-14H], eax                    ; 006E _ 89. 45, EC
; Unconditional jump back to the test at the top.
        jmp     ?_001                                   ; 0071 _ EB, E0

任何看过一点汇编代码的人都应该能一眼看出其中的问题。

1. 循环紧密依赖于放入 eax 的值。由于每条后续指令都会在该寄存器上产生依赖，乱序执行几乎不可能发挥作用。
2. 有六个通用寄存器可用（因为在大多数配置下 ebp 和 esp 并不算真正的通用寄存器），但您的编译器一个都没用，而是退回去使用本地栈。当速度是优化目标时，这是绝对不能接受的。我们甚至可以看到当前的循环索引存储在 [ebp-4H]，而它本来可以很容易地放在寄存器中。
3. cmp 指令使用了一个内存操作数和一个立即数操作数。这是最慢的操作数组合，在性能攸关时绝不应使用。
4. 更不用说代码大小了。这些指令中有一半是不必要的。

总而言之，我要做的第一件事就是尽早放弃那个编译器。但是话又说回来，看到它提供“内存模型”作为其选择之一，人们似乎真的没有太大希望。

score 5 · Accepted Answer

在没有看到它创建的汇编语言结果的情况下，很难猜测你的编译器可能在做什么。使用 VC++ 10，我得到以下结果：

time of for-loop(cycles) 155

time of while-loop(cycles)  158

time of custom-loop-1(cycles)   369

time of custom-loop-2(cycles)  314

我没有看输出，但我的直接猜测是 for 和 while 循环之间的差别只是噪声。不过，两者显然都比您手写的汇编代码快得多。

编辑：查看汇编代码后，我猜对了——for 和 while 的代码是相同的。它看起来像这样：

        ; VC++ 10 output for the timed loop: a, b and the counter all stay in
        ; registers, and nothing inside the loop touches memory.
        call    _clock
        mov     ecx, DWORD PTR _a$[ebp]         ; ecx = a
        cdq                                     ; sign-extend eax (clock() result) into edx
        mov     ebx, edx                        ; ebx:edi = first clock() reading, kept across the loop
        mov     edx, DWORD PTR _b$[ebp]         ; edx = b
        mov     edi, eax
        mov     esi, 200000000                  ; esi = remaining iterations, counts down
$LL2@main:
; Line 28
        dec     esi                             ; sets ZF when the count reaches zero
; Line 30
        mov     eax, ecx                        ; temp = a
; Line 31
        mov     ecx, edx                        ; a = b
; Line 32
        mov     edx, eax                        ; b = temp
        jne     SHORT $LL2@main                 ; uses the flags from dec; mov leaves flags untouched
        mov     DWORD PTR _b$[ebp], edx         ; write b and a back once, after the loop
        mov     DWORD PTR _a$[ebp], ecx
; Line 35
        call    _clock

虽然可以说不如第二个循环“聪明”，但现代 CPU 往往用简单的代码做得最好。它在循环内也只有更少的指令（并且根本不引用循环内的内存）。无论如何，这些都不是衡量效率的唯一指标，但是通过这个简单的循环，它们具有相当的指示性。

编辑2：

只是为了好玩，我写了一个新版本，增加了三次异或交换，以及一个使用 CPU 的 xchg 指令的版本（只是因为在不太在意速度的情况下，我手写时大概就会这么写）。尽管 Intel/AMD 通常建议不要使用较复杂的指令，但它似乎并没有造成问题——它的速度看起来至少和其他任何版本一样快：

 time of for-loop(cycles) 156

 time of while-loop(cycles)  160

 time swap between register and cache  284

 time to swap using add/sub:  308

 time to swap using xchg:  155

 time to swap using triple-xor  233

来源：

// Note: updated source -- it was just too ugly to live. Same results though.
#include<stdlib.h>
#include<time.h>
#include <iostream>
#include <string>
#include <iomanip>
#include <sstream>

// File-local state shared by every benchmark body in this file.
namespace { 
    int a, b;                       // the two values each benchmark swaps
    const int loops = 200000000;    // number of swaps each benchmark performs
}

// Runs one benchmark from its constructor and reports the result.
//
// swapper must be default-constructible and callable with no arguments.
// The constructor takes a clock() reading, invokes a fresh swapper once,
// takes a second reading, and prints one line to std::cout of the form
//   "Time for swap using <label>" (left-aligned, padded to 30) " = " <ticks>
// where <ticks> is the difference of the two clock() readings.
template <class swapper>
struct timer {
    timer(std::string const &label) {
        clock_t const started = clock();
        swapper()();
        clock_t const finished = clock();

        std::string const caption = "Time for swap using " + label;
        std::cout << std::left << std::setw(30) << caption
                  << " = " << (finished - started) << "\n";
    }
};

// Benchmark body: swaps the file-scope a and b `loops` times through a
// temporary, using a counted for loop. Kept as plain C++ so that the
// compiler's own code generation is what gets timed.
struct for_loop {
    void operator()() {
        int temp;
        for(int i=0;i<loops;i++) {
            temp=a;// save a
            a=b;// a takes b's value
            b=temp;// b takes the saved a
        }
    }
};

// Benchmark body: the same swap as for_loop, written as a while loop with an
// explicit counter j.
struct while_loop {
    void operator()() { 
        int j = 0;
        int temp;
        while(j<loops) {
            temp=a;// save a
            a=b;// a takes b's value
            b=temp;// b takes the saved a
            j++;// advance the loop counter
        }
    }
};

// Benchmark body: hand-written loop that swaps a and b through eax/ebx,
// loading and storing both variables on every iteration.
struct reg_mem {
    void operator()() {
        int j=loops;//iteration limit, loaded into edx below
        __asm {
            mov ecx,0       //ecx = loop counter, counts up from 0
            mov edx,j       //edx = loop limit
    do_it_again:    //top of the loop
            mov eax,a       //eax = a (load)
            mov ebx,b       //ebx = b (load)
            mov b,eax       //b = old a (store)
            mov a,ebx       //a = old b (store); two loads and two stores per iteration

            inc ecx         //counter++
            cmp ecx,edx     //counter < limit ?
            jb do_it_again  //unsigned "below": loop until counter reaches limit
        }
    }
};

// Benchmark body: swaps two register copies of a and b `loops` times using
// only add/sub/neg (no temporary register), then stores the result to a and b.
//
// Fix over the original version: the last step of the swap used
// `xor eax,80000000h`, which only flips the sign bit and is not a
// two's-complement negation (with a=b=0 it produced 0x80000000 instead of 0).
// `neg eax` turns -b into b correctly.
struct add_sub {
    void operator()() { 
        int j=loops;//iteration limit, loaded into edx below
        __asm {
            mov ecx,0       //ecx = loop counter, counts up from 0
            mov edx,j       //edx = loop limit

            mov eax,a       //eax = a, ebx = b: the loop itself never touches memory
            mov ebx,b

    do_it_again2:   //top of the loop

            sub eax,ebx     //eax = a-b
            add ebx,eax     //ebx = b+(a-b) = a
            sub eax,ebx     //eax = (a-b)-a = -b
            neg eax         //eax = b; the swap is complete

            inc ecx         //counter++
            cmp ecx,edx     //counter < limit ?
            jb do_it_again2 //unsigned "below": loop until counter reaches limit

            mov a, eax      //write the final values back
            mov b, ebx
        }
    }
};

// Benchmark body: swaps two register copies of a and b `loops` times with the
// xchg instruction, then stores the result to a and b.
struct xchg {
    void operator()() {
        __asm {
            mov ecx, loops      // ecx = remaining iterations, counts down
            mov eax, a          // work on register copies of a and b
            mov ebx, b
    do_it_again3:
            dec ecx             // sets ZF when the count reaches zero
            xchg eax, ebx       // swap the two registers
            jne do_it_again3    // tests the flags left by dec; nothing in between changes them
            mov a, eax          // write the final values back
            mov b, ebx
        }
    }
};

// Benchmark body: swaps two register copies of a and b `loops` times with the
// three-XOR swap, then stores the result to a and b.
struct xor3 {
    void operator()() { 
        _asm { 
            mov ecx, loops      // ecx = remaining iterations, counts down
            mov eax, a          // work on register copies of a and b
            mov edx, b
    do_swap4:
            xor eax, edx        // eax = a^b
            xor edx, eax        // edx = b^(a^b) = a
            xor eax, edx        // eax = (a^b)^a = b
            dec ecx             // sets ZF when the count reaches zero
            jnz do_swap4        // loop while iterations remain

            mov a, eax          // write the final values back
            mov b, edx
        }
    }
};

int main() {
    timer<for_loop>("for loop");
    timer<while_loop>("while loop");
    timer<reg_mem>("reg<->mem");
    timer<add_sub>("add/sub");
    timer<xchg>("xchg");
    timer<xor3>("triple xor");
    return 0;
}

底线：至少对于这样一个微不足道的任务，您胜过一个像样的编译器的幅度不会大到值得在意（很可能根本胜不过，除了代码可能稍微小一点）。

score 3 · Accepted Answer

这可能是由于编译器未能使其成为寄存器操作数，而是使用间接（地址）操作数。

切换编译器<--这是你最好的优化。

更新：我不厌其烦地把同一个程序翻译成了 gcc 的 Intel 语法内联汇编：test.c。它清楚地显示出 for 循环和 while 循环远远优于手写汇编。

也就是说，使用 Digital Mars，以下速度更快：

// Hand-written loop 3: three-XOR swap of eax/ebx with a count-down loop.
// The swapped values are stored to a and b on every iteration.
//
// Fix over the original version: the counter was set up with `xor ecx,j`,
// which XORs j into whatever ecx happened to contain instead of loading j,
// so the iteration count was unpredictable. `mov ecx,j` loads the count.
__asm
{
    mov ecx,j     //ecx = remaining iterations (counts down from j to 0)

    mov eax,a     //eax = a, ebx = b
    mov ebx,b

do_it_again3: //top of the loop

    //three-XOR swap
    xor eax,ebx         //eax = a^b
    xor ebx,eax         //ebx = b^(a^b) = a
    xor eax,ebx         //eax = (a^b)^a = b

    mov a,eax           //store the swapped values
    mov b,ebx

    dec ecx           //sets ZF when the count reaches zero, so no cmp is needed
    jnz do_it_again3  //loop while iterations remain
}

它使用了：

- XOR 交换惯用法
- 递减计数的循环
- 隐式的比较标志（由 dec ecx 设置）

我对 Digital Mars Compiler 版本 8.42n 的基准测试结果为：

time of for-loop(cycles) 572  
time of while-loop(cycles)  566  
time of custom-loop-1(cycles)   355   
time of custom-loop-2(cycles)  317   
time of custom-loop-3(cycles)  234

完整清单：

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

int main()
{
    int j=0;

    int a=0,b=0,temp=0;

    srand(time(0));
    time_t t1=0;
    time_t t2=0;


    t1=clock();
    for(int i=0; i<200000000; i++)
    {
        temp=a;//instruction 1
        a=b;//instruction 2
        b=temp;//3 instructions total
    }
    t2=clock();
    printf("\n time of for-loop(cycles) %i  \n",(t2-t1));


    t1=clock();
    while(j<200000000)
    {
        temp=a;//again it is three instructions
        a=b;
        b=temp;
        j++;
    }
    t2=clock();
    printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));


    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        pushf           //backup
        push eax        //backup
        push ebx        //backup
        push ecx        //backup
        push edx        //backup

        mov ecx,0       //init of loop range(0 to 200000000)
        mov edx,j

        do_it_again:    //begin to loop


        mov eax,a       //basic swap steps between cpu and mem(cache)
        mov ebx,b
        mov b,eax
        mov a,ebx       //four instructions total

        inc ecx         // j++
        cmp ecx,edx     //i<200000000  ?
        jb do_it_again  // end of loop block

        pop edx     //rolling back to history
        pop ecx
        pop ebx
        pop eax
        popf
    }

    t2=clock();
    printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));

    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        pushf           //backup
            push eax        
            push ebx        
            push ecx        
            push edx        

            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j

            mov eax,a       //getting variables to registers
            mov ebx,b

            do_it_again2:   //begin to loop

            //swapping with using only 2 variables(only in cpu)
            sub eax,ebx         //a is now a-b
            add ebx,eax         //b is now a
            sub eax,ebx         //a is now -b
            xor eax,80000000h   //a is now b and four instructions total

            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again2  // end of loop block

            pop edx         //rollback
            pop ecx         
            pop ebx         
            pop eax         
            popf            
    }

    t2=clock();
    printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));

    t1=clock();
    j=200000000;//setting the count
    __asm
    {
        xor ecx,j     //init of loop range(200000000 to 0)

        mov eax,a     //getting variables to registers
        mov ebx,b

    do_it_again3:   //begin to loop

        //swapping with using only 2 variables(only in cpu)
        xor eax,ebx
        xor ebx,eax         
        xor eax,ebx         

        mov a,eax
        mov b,ebx

        dec ecx         // j--
        jnz do_it_again3  // end of loop block
    }

    t2=clock();
    printf("\n time of custom-loop-3(cycles)  %i   \n",(t2-t1));

    return 0;

}

score 2 · Accepted Answer

我很惊讶你们中的任何人都从 C 代码中得到了零周期以外的任何东西。在这里，使用 gcc 4.6.3 和-O2，循环消失了，因为它没有副作用。除了 asm 块之外的所有内容都被删除。如果 Digital Mars 不能做如此微不足道的优化，我会感到惊讶；我敢打赌，您可以尝试不同的优化开关来删除 C 代码，此时这种微不足道的比较变得不可能。

您的玩具示例无法用来比较编译器优化与手写汇编。从统计上讲，编译器能够稳定地写出比人类更好的机器代码。

score 0 · Accepted Answer

这是正常的，更换编译器并不能解决这个“问题”。汇编语言非常底层，一切都由您自己控制。您的 C++ 编译器做的总是比实际需要的多。例如，调用函数会比在汇编中花费更多时间，因为编译器会保护栈。在循环中也是一样：声明一个新变量需要更多时间，做加法等操作也是如此……

想了解更多信息，可以参考这个问题：汇编何时比 C 更快？

c++ - 为什么自定义循环更快？编译器不好？不安全的自定义代码？运气？（幸运的缓存命中）

5 回答 5

Related

Reference