3

通常我会让编译器完成优化复杂逻辑表达式的魔法,但是,在这种情况下,我必须使用的编译器在这方面不是很擅长(基本上它所能做的就是用位移替换 /64 之类的东西和%512 按位与)。

是否有任何可用的工具可以分析和提供表达式的优化版本(即与优秀的优化编译器相同的方式)?

例如,我想优化以下内容:

int w = 2 - z/2;

int y0 = y + (((v % 512) / 64) / 4) * 8           + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;

int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
4

2 回答 2

1

这是一个测试:

typedef int MyInt; // or unsigned int

MyInt get(MyInt x, MyInt y, MyInt z, MyInt v, MyInt mb)
{
    MyInt w = 2 - z/2;

    MyInt y0 = y + (((v % 512) / 64) / 4) * 8           + ((v / 512) / mb)*16;
    MyInt x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;

    MyInt i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);

    return i;
}

我用 GCC 4.7.0 和-O3.

int

.LFB0:
        movl    %ecx, %eax
        movq    %r12, -24(%rsp)
.LCFI0:
        movl    %edx, %r12d
        sarl    $31, %eax
        shrl    $31, %r12d
        movq    %r13, -16(%rsp)
        shrl    $23, %eax
        addl    %edx, %r12d
        movq    %rbx, -40(%rsp)
        leal    (%rcx,%rax), %r9d
        movl    %r12d, %r11d
        movq    %r14, -8(%rsp)
        sarl    %r11d
        movq    %rbp, -32(%rsp)
.LCFI1:
        movl    %edx, %ebp
        andl    $511, %r9d
        negl    %r11d
        subl    %eax, %r9d
        leal    511(%rcx), %eax
        testl   %ecx, %ecx
        leal    2(%r11), %r13d
        leal    63(%r9), %ebx
        cmovns  %ecx, %eax
        sarl    $9, %eax
        movl    %r13d, %r14d
        xorl    $3, %r14d
        movl    %eax, %edx
        testl   %r9d, %r9d
        cmovns  %r9d, %ebx
        sarl    $31, %edx
        addl    $1, %r11d
        idivl   %r8d
        movl    %ebx, %r10d
        sarl    $31, %ebx
        shrl    $30, %ebx
        sarl    $6, %r10d
        addl    %ebx, %r10d
        andl    $3, %r10d
        subl    %ebx, %r10d
        movq    -40(%rsp), %rbx
        sall    $3, %r10d
        sall    $3, %edx
        imull   %r11d, %r10d
        imull   %r13d, %edx
        movq    -16(%rsp), %r13
        addl    %edi, %r10d
        addl    %edx, %r10d
        leal    255(%r9), %edx
        imull   %r10d, %r14d
        testl   %r9d, %r9d
        cmovs   %edx, %r9d
        sall    $4, %eax
        sarl    %r12d
        sarl    $8, %r9d
        leal    (%rsi,%r9,8), %ecx
        addl    %eax, %ecx
        leal    -3(%rbp,%rbp), %eax
        movq    -32(%rsp), %rbp
        imull   %r8d, %ecx
        imull   %r12d, %eax
        movq    -24(%rsp), %r12
        sall    $4, %ecx
        addl    %r14d, %ecx
        movq    -8(%rsp), %r14
        leal    (%rax,%rcx,2), %eax
        ret

unsigned int

.LFB0:
        movl    %ecx, %eax
        movq    %rbp, -16(%rsp)
        movl    %edx, %r11d
.LCFI0:
        movl    %edx, %ebp
        shrl    $9, %eax
        xorl    %edx, %edx
        divl    %r8d
        movq    %r12, -8(%rsp)
.LCFI1:
        movl    %ecx, %r12d
        shrl    %r11d
        andl    $511, %r12d
        movq    %rbx, -24(%rsp)
.LCFI2:
        movl    $2, %r10d
        movl    %r12d, %r9d
        movl    $1, %ebx
        subl    %r11d, %r10d
        shrl    $6, %r9d
        subl    %r11d, %ebx
        shrl    $8, %r12d
        andl    $3, %r9d
        sall    $4, %r8d
        imull   %ebx, %r9d
        leal    (%r12,%rax,2), %eax
        movq    -24(%rsp), %rbx
        imull   %r10d, %edx
        xorl    $3, %r10d
        movq    -8(%rsp), %r12
        leal    (%rsi,%rax,8), %eax
        addl    %edx, %r9d
        leal    (%rdi,%r9,8), %edi
        imull   %eax, %r8d
        leal    -3(%rbp,%rbp), %eax
        movq    -16(%rsp), %rbp
        imull   %r10d, %edi
        imull   %r11d, %eax
        addl    %edi, %r8d
        leal    (%rax,%r8,2), %eax
        ret

通过手动折叠常量来进一步“优化”(可以预见)没有进一步的影响。

于 2012-09-04T08:57:28.483 回答
1

当我想要优化时,我倾向于检查 Clang 生成的 LLVM IR。它比纯汇编更具可读性(我发现)。

int foo(int v, int mb, int x, int y, int z) {
  int w = 2 - z/2;

  // When you have specific constraints, tell the optimizer about it !
  if (w < 0 || w > 2) { return 0; }

  int y0 = y + (((v % 512) / 64) / 4) * 8           + ((v / 512) / mb)*16;
  int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;

  int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);

  return i;
}

转化为:

define i32 @foo(i32 %v, i32 %mb, i32 %x, i32 %y, i32 %z) nounwind uwtable readnone {
  %1 = sdiv i32 %z, 2
  %2 = sub nsw i32 2, %1
  %3 = icmp slt i32 %2, 0
  %4 = icmp slt i32 %z, -1
  %or.cond = or i1 %3, %4
  br i1 %or.cond, label %31, label %5

; <label>:5                                       ; preds = %0
  %6 = srem i32 %v, 512
  %7 = sdiv i32 %6, 64
  %8 = sdiv i32 %6, 256
  %9 = shl i32 %8, 3
  %10 = sdiv i32 %v, 512
  %11 = sdiv i32 %10, %mb
  %12 = shl i32 %11, 4
  %13 = add i32 %9, %y
  %14 = add i32 %13, %12
  %15 = srem i32 %7, 4
  %16 = add nsw i32 %2, -1
  %17 = mul i32 %16, %15
  %18 = srem i32 %10, %mb
  %19 = mul i32 %2, %18
  %tmp = add i32 %19, %17
  %tmp2 = shl i32 %tmp, 3
  %20 = add nsw i32 %tmp2, %x
  %21 = shl i32 %2, 1
  %22 = xor i32 %21, 6
  %23 = mul i32 %22, %20
  %24 = shl i32 %mb, 5
  %25 = mul i32 %24, %14
  %26 = shl i32 %z, 1
  %27 = add nsw i32 %26, -3
  %28 = mul nsw i32 %1, %27
  %29 = add i32 %25, %28
  %30 = add i32 %29, %23
  br label %31

; <label>:31                                      ; preds = %5, %0
  %.0 = phi i32 [ %30, %5 ], [ 0, %0 ]
  ret i32 %.0
}

我不知道它是否是最佳的,但它肯定是相对可读的。

如果您可以指示输入的所有约束(如果需要,所有五个约束),那就太好了,因为优化器可能能够使用它们。

于 2012-09-04T09:45:38.287 回答