4

I'm starting to read LLVM docs and IR documentation.

In common architectures, an asm cmp instruction "result" value is -at least- 3 bits long, let's say the first bit is the SIGN flag, the second bit is the CARRY flag and the third bit is the ZERO flag.

Question 1)

Why the IR icmp instruction result value is only i1? (you can choose only one flag)

Why doesn't IR define, let's call it a icmp2 instruction returning an i3 having SIGN,CARRY and ZERO flags?

This i3 value can be acted upon with a switch instruction, or maybe a specific br2 instruction, like:

%result = cmp2 i32 %a, i32 %b
br2 i3 %result onzero label %EQUAL, onsign label %A_LT_B
#here %a GT %b

Question 2)

Does this make sense? Could this br2 instruction help create new optimizations? i.e. remove all jmps? it is necessary or the performance gains are negligible?

The reason I'm asking this -besides not being an expert in LLVM- is because in my first tests I was expecting some kind of optimization to be made by LLVM in order to avoid making the comparison twice and also avoid all branches by using asm conditional-move instructions.

My Tests:

I've compiled with clang-LLVM this:

#include <stdlib.h>
#include <inttypes.h>
typedef int32_t i32;

i32 compare (i32 a, i32 b){
// return (a - b) & 1;
   if (a>b) return 1;
   if (a<b) return -1;
   return 0;
}

int main(int argc, char** args){
    i32 n,i;
    i32 a,b,avg;

    srand(0); //fixed seed

    for (i=0;i<500;i++){
        for (n=0;n<1e6;n++){
            a=rand();
            b=rand();
            avg+=compare(a,b);
        }
    }
    return avg;
}

Output asm is: ...

    mov r15d, -1

 ...

.LBB1_2:                                #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
    call    rand
    mov r12d, eax
    call    rand
    mov ecx, 1
    cmp r12d, eax
    jg  .LBB1_4
# BB#3:                                 #   in Loop: Header=BB1_2 Depth=2
    mov ecx, 0
    cmovl   ecx, r15d
.LBB1_4:                                # %compare.exit
                                        #   in Loop: Header=BB1_2 Depth=2
    add ebx, ecx

...

I expected (all jmps removed in the inner loop):

    mov r15d, -1
    mov r13d, 1  # HAND CODED

    call    rand
    mov r12d, eax
    call    rand

    xor ecx,ecx            # HAND CODED
    cmp r12d, eax
    cmovl   ecx, r15d      # HAND CODED
    cmovg   ecx, r13d      # HAND CODED
    add ebx, ecx

Performance difference (1s) seems to be negligible (on a VM under VirtualBox):

  • LLVM generated asm: 12.53s
  • hancoded asm: 11.53s
  • diff: 1s, in 500 millions iterations

Question 3)

Are my performance measures correct? Here's the makefile and the full hancoded.compare.s

makefile:

CC=clang -mllvm --x86-asm-syntax=intel

all:
    $(CC) -S -O3 compare.c 
    $(CC) compare.s -o compare.test

    $(CC) handcoded.compare.s -o handcoded.compare.test

    echo `time ./compare.test`
    echo `time ./handcoded.compare.test`
    echo `time ./compare.test`
    echo `time ./handcoded.compare.test`

hand coded (fixed) asm:

    .text
    .file   "handcoded.compare.c"
    .globl  compare
    .align  16, 0x90
    .type   compare,@function
compare:                                # @compare
    .cfi_startproc
# BB#0:
    mov eax, 1
    cmp edi, esi
    jg  .LBB0_2
# BB#1:
    xor ecx, ecx
    cmp edi, esi
    mov eax, -1
    cmovge  eax, ecx
.LBB0_2:
    ret
.Ltmp0:
    .size   compare, .Ltmp0-compare
    .cfi_endproc

    .globl  main
    .align  16, 0x90
    .type   main,@function
main:                                   # @main
    .cfi_startproc
# BB#0:
    push    rbp
.Ltmp1:
    .cfi_def_cfa_offset 16
    push    r15
.Ltmp2:
    .cfi_def_cfa_offset 24
    push    r14
.Ltmp3:
    .cfi_def_cfa_offset 32
    push    r12
.Ltmp4:
    .cfi_def_cfa_offset 40
    push    rbx
.Ltmp5:
    .cfi_def_cfa_offset 48
.Ltmp6:
    .cfi_offset rbx, -48
.Ltmp7:
    .cfi_offset r12, -40
.Ltmp8:
    .cfi_offset r14, -32
.Ltmp9:
    .cfi_offset r15, -24
.Ltmp10:
    .cfi_offset rbp, -16
    xor r14d, r14d
    xor edi, edi
    call    srand
    mov r15d, -1
    mov r13d, 1  # HAND CODED

                                        # implicit-def: EBX
    .align  16, 0x90
.LBB1_1:                                # %.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB1_2 Depth 2
    mov ebp, 1000000
    .align  16, 0x90
.LBB1_2:                                #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2

    call    rand
    mov r12d, eax
    call    rand

    xor ecx,ecx     #hand coded
    cmp r12d, eax  

    cmovl   ecx, r15d #hand coded
    cmovg   ecx, r13d #hand coded
    add ebx, ecx

.LBB1_3:
    dec ebp
    jne .LBB1_2
# BB#5:                                 #   in Loop: Header=BB1_1 Depth=1
    inc r14d
    cmp r14d, 500
    jne .LBB1_1
# BB#6:
    mov eax, ebx
    pop rbx
    pop r12
    pop r14
    pop r15
    pop rbp
    ret
.Ltmp11:
    .size   main, .Ltmp11-main
    .cfi_endproc


    .ident  "Debian clang version 3.5.0-1~exp1 (trunk) (based on LLVM 3.5.0)"
    .section    ".note.GNU-stack","",@progbits
4

1 回答 1

1

问题 1:LLVM IR 独立于机器。有些机器甚至可能没有进位标志,甚至没有零标志或符号标志。返回值是 i1,它足以指示 TRUE 或 FALSE。您可以设置比较条件,例如'eq',然后检查结果以查看两个操作数是否相等等。

问题 2:LLVM IR 最初并不关心优化。主要目标是生成基于静态单一分配 (SSA) 的指令表示。优化发生在后面的通道中,其中一些是机器独立的,有些是机器相关的。您的 br2 想法将假设机器将支持这 3 个标志,这可能是错误的假设,

问题 3:我不确定您在这里要做什么。你能解释更多吗?

于 2017-08-31T08:33:01.397 回答