c++ - 整数类包装器性能

Question

我正打算改造一个现有的定点数库。目前，该库只是一组在 32 位有符号整数上运行的命名空间函数。我想改变这种做法，创建一个包装整数的定点类，但不想为这样细粒度的东西付出任何与类相关的性能代价，因为性能对这个用例来说很重要。

由于预期类具有如此简单的数据要求，并且没有资源，我认为可以使类“面向值”，利用非修改操作并在合理的情况下按值传递实例。如果实现，这将是一个简单的类，而不是层次结构的一部分。

我想知道是否可以编写一个整数包装类，使其与直接使用原始整数相比不会产生实际的性能损失。我几乎确信这是可行的，但对编译过程了解得还不够多，不敢贸然动手。

我知道据说 stl 迭代器被编译为简单的指针操作，并且只想对整数操作做类似的事情。

无论如何，作为项目的一部分，该库将升级到 C++11，所以我希望至少借助 constexpr 和其他新特性（如右值引用），把这个类的性能提升到接近纯整数操作的水平。

此外，任何对两种实现之间的性能差异进行基准测试的建议都将不胜感激。

score 3 · Accepted Answer

这个问题的有趣之处在于它是如此依赖于编译器。使用 Clang/LLVM：

#include <iostream>
using namespace std;

// Raw-int baseline: returns `a` shifted left by one bit.
inline int foo(int a)
{
    const int shifted = a << 1;
    return shifted;
}

// Minimal value-type wrapper around a single int; the class-based counterpart
// of the free function foo() in this comparison.
struct Bar
{
    int a;  // the wrapped value

    // Deliberately non-explicit: the example relies on implicit int -> Bar
    // conversion (the `return a << 1;` in baz() and the call `f2(argc)`).
    Bar(int x) : a(x) {}

    // Returns a new Bar holding `a` shifted left by one bit. It does not
    // modify *this, so it is const-qualified and usable on const objects.
    Bar baz() const { return a << 1; }
};

// Writes an int to standard output. Declared noinline so the call to it stays
// a real call in the generated code.
void out(int x) __attribute__ ((noinline));
void out(int x)
{
    std::cout << x;
}

// Writes the int wrapped by a Bar to standard output. Declared noinline so the
// call to it stays a real call in the generated code.
void out(Bar x) __attribute__ ((noinline));
void out(Bar x)
{
    std::cout << x.a;
}

// Raw-int path of the comparison: doubles x with foo() and prints the result.
// Declared noinline so it is emitted as a standalone function that can be
// inspected. Uses the documented `__attribute__` spelling, matching the
// declarations of out().
void f1(int x) __attribute__ ((noinline));
void f1(int x) { out(foo(x)); }

// Wrapper-class path of the comparison: doubles b with Bar::baz() and prints
// the result. Declared noinline so it is emitted as a standalone function that
// can be inspected. Uses the documented `__attribute__` spelling, matching the
// declarations of out().
void f2(Bar b) __attribute__ ((noinline));
void f2(Bar b) { out(b.baz()); }

// Runs both variants on the same run-time value (the argument count).
int main(int argc, char** argv)
{
    (void)argv;  // unused

    const int input = argc;
    f1(input);
    f2(input);  // implicitly converted from int to Bar
    return 0;
}

给出以下 IR：

define void @_Z3outi(i32 %x) uwtable noinline {
  %1 = tail call %"class.std::basic_ostream"*
                 @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cout, i32 %x)
  ret void
}

define void @_Z3out3Bar(i32 %x.coerce) uwtable noinline {
  %1 = tail call %"class.std::basic_ostream"*
                 @_ZNSolsEi(%"class.std::basic_ostream"* @_ZSt4cout, i32 %x.coerce)
  ret void
}

define void @_Z2f1i(i32 %x) uwtable noinline {
  %1 = shl i32 %x, 1
  tail call void @_Z3outi(i32 %1)
  ret void
}

define void @_Z2f23Bar(i32 %b.coerce) uwtable noinline {
  %1 = shl i32 %b.coerce, 1
  tail call void @_Z3out3Bar(i32 %1)
  ret void
}

不出所料，生成的汇编代码完全相同：

    .globl  _Z2f1i
    .align  16, 0x90
    .type   _Z2f1i,@function
_Z2f1i:                                 # @_Z2f1i
.Ltmp6:
    .cfi_startproc
# BB#0:
    addl    %edi, %edi
    jmp _Z3outi                 # TAILCALL
.Ltmp7:
    .size   _Z2f1i, .Ltmp7-_Z2f1i
.Ltmp8:
    .cfi_endproc
.Leh_func_end2:


    .globl  _Z2f23Bar
    .align  16, 0x90
    .type   _Z2f23Bar,@function
_Z2f23Bar:                              # @_Z2f23Bar
.Ltmp9:
    .cfi_startproc
# BB#0:
    addl    %edi, %edi
    jmp _Z3out3Bar              # TAILCALL
.Ltmp10:
    .size   _Z2f23Bar, .Ltmp10-_Z2f23Bar
.Ltmp11:
    .cfi_endproc
.Leh_func_end3:

通常，只要类的方法被内联，this 参数和引用就可以很容易地被省略。我不太明白 gcc 怎么会把这个搞砸。

score 1 · Accepted Answer

使用值语义实现定点算术将产生较差的性能，因为......

#include <iostream>
using namespace std;

// Raw-int baseline: returns `a` shifted left by one bit.
inline int foo(int a)
{
    const int shifted = a << 1;
    return shifted;
}

// Minimal value-type wrapper around a single int; the class-based counterpart
// of the free function foo() in this comparison.
struct Bar
{
    int a;  // the wrapped value

    // Deliberately non-explicit: the example relies on implicit int -> Bar
    // conversion (the `return a << 1;` in baz() and the call `f2(argc)`).
    Bar(int x) : a(x) {}

    // Returns a new Bar holding `a` shifted left by one bit. It does not
    // modify *this, so it is const-qualified and usable on const objects.
    Bar baz() const { return a << 1; }
};

// Writes an int to standard output. Declared noinline so the call to it stays
// a real call in the generated code.
void out(int x) __attribute__ ((noinline));
void out(int x)
{
    std::cout << x;
}

// Writes the int wrapped by a Bar to standard output. Declared noinline so the
// call to it stays a real call in the generated code.
void out(Bar x) __attribute__ ((noinline));
void out(Bar x)
{
    std::cout << x.a;
}

// Raw-int path of the comparison: doubles x with foo() and prints the result.
// Declared noinline so it is emitted as a standalone function that can be
// inspected. Uses the documented `__attribute__` spelling, matching the
// declarations of out().
void f1(int x) __attribute__ ((noinline));
void f1(int x) { out(foo(x)); }

// Wrapper-class path of the comparison: doubles b with Bar::baz() and prints
// the result. Declared noinline so it is emitted as a standalone function that
// can be inspected. Uses the documented `__attribute__` spelling, matching the
// declarations of out().
void f2(Bar b) __attribute__ ((noinline));
void f2(Bar b) { out(b.baz()); }

// Runs both variants on the same run-time value (the argument count).
int main(int argc, char** argv)
{
    (void)argv;  // unused

    const int input = argc;
    f1(input);
    f2(input);  // implicitly converted from int to Bar
    return 0;
}

现在让我们看看f1和f2的反汇编...

00000000004006e0 <f1(int)>:
  4006e0:   01 ff                   add    edi,edi
  4006e2:   e9 d9 ff ff ff          jmp    4006c0 <out(int)>
  4006e7:   66 0f 1f 84 00 00 00    nop    WORD PTR [rax+rax*1+0x0]
  4006ee:   00 00 

00000000004006f0 <f2(Bar)>:
  4006f0:   48 83 ec 08             sub    rsp,0x8
  4006f4:   01 ff                   add    edi,edi
  4006f6:   e8 d5 ff ff ff          call   4006d0 <out(Bar)>
  4006fb:   48 83 c4 08             add    rsp,0x8
  4006ff:   c3                      ret

如您所见， f2 对堆栈指针有一些额外的干扰，这也阻止了 ret 被省略。

（这是 -O3 的 g++ 4.6.1）

c++ - 整数类包装器性能

2 回答 2

Related

Reference