c++ - 为什么简单使用 ostringstream 会生成这么多汇编代码？

Question

考虑以下简单示例：它使用 ostringstream 格式化一个字符串和一个整数，然后丢弃输出：

#include <sstream>

// Formats the text "x = " followed by the integer 42 into an in-memory
// stream, then materialises the buffered string and throws it away.
// Kept deliberately minimal so the generated code can be inspected.
void ostringstream_test() {
  std::ostringstream out;
  out << "x = ";                      // literal prefix
  out << 42;                          // integer formatted by the stream
  static_cast<void>(out.str());       // result intentionally discarded
}

使用 clang++ -S -O3 -DNDEBUG -std=c++14 test.cc 编译它会生成大量汇编代码（约半千字节的 x86-64 指令，而类似的 sprintf 代码不到一百字节）——请参见下面的输出。为什么会生成这么多代码？这是 ostringstream API 固有的问题，还是这个特定的编译器/库做错了什么？

    ##----------------------------------------------------------------------
    ## void ostringstream_test()   (mangled: __Z18ostringstream_testv)
    ## Compiler-generated x86-64 code, AT&T operand order (src, dst).
    ## Arguments are passed in rdi/rsi/rdx and results come back in rax,
    ## consistent with the System V AMD64 calling convention.
    ##
    ## Frame layout (offsets from %rbp), as used by the code below:
    ##   -368  start of the ostringstream object (passed as the stream to
    ##         the insertion calls); first word is a vtable pointer
    ##   -360  stringbuf sub-object            (address kept in %r12)
    ##   -256  basic_ios sub-object            (address kept in %r14)
    ##   -104  return slot for stringbuf::str() const
    ##    -80  temporary string passed to stringbuf::str(const string&)
    ##    -48  spill slot for the exception pointer on unwind paths
    ##
    ## Callee-saved registers holding values live across calls:
    ##   %r13 = ostringstream vtable + 24, %rbx = ostringstream vtable + 64,
    ##   %r15 = stringbuf vtable + 16 (re-stored into the object before
    ##   destruction).
    ##
    ## The Ltmp* labels bracket each call that may throw; the exception
    ## table (Lexception0) maps those ranges to the landing pads after retq.
    ##----------------------------------------------------------------------
    .globl  __Z18ostringstream_testv
    .p2align    4, 0x90
__Z18ostringstream_testv:               ## @_Z18ostringstream_testv
Lfunc_begin0:
    .cfi_startproc
    .cfi_personality 155, ___gxx_personality_v0
    .cfi_lsda 16, Lexception0
## BB#0:
    ## Prologue: frame pointer, five callee-saved registers (40 bytes) and
    ## 328 bytes of locals = 368 bytes below the saved %rbp, a multiple of 16.
    pushq   %rbp
Lcfi0:
    .cfi_def_cfa_offset 16
Lcfi1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Lcfi2:
    .cfi_def_cfa_register %rbp
    pushq   %r15
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbx
    subq    $328, %rsp              ## imm = 0x148
Lcfi3:
    .cfi_offset %rbx, -56
Lcfi4:
    .cfi_offset %r12, -48
Lcfi5:
    .cfi_offset %r13, -40
Lcfi6:
    .cfi_offset %r14, -32
Lcfi7:
    .cfi_offset %r15, -24
    ## r14 = &basic_ios sub-object, r12 = &stringbuf sub-object.
    leaq    -256(%rbp), %r14
    leaq    -360(%rbp), %r12
    ## Install pointers into the construction vtable (_ZTC..., the
    ## basic_ostream-in-basic_ostringstream table) while the bases are
    ## being set up: +24 for the object start, +64 for the basic_ios part.
    movq    __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE@GOTPCREL(%rip), %rax
    leaq    24(%rax), %rcx
    movq    %rcx, -368(%rbp)
    addq    $64, %rax
    movq    %rax, -256(%rbp)
Ltmp0:
    ## std::ios_base::init(void*) on the basic_ios part, passing the
    ## stringbuf address. May throw -> landing pad Ltmp2.
    movq    %r14, %rdi
    movq    %r12, %rsi
    callq   __ZNSt3__18ios_base4initEPv
Ltmp1:
## BB#1:
    ## NOTE(review): -120/-112 are presumably basic_ios members (tie
    ## pointer and fill character) -- confirm against the libc++ layout.
    movq    $0, -120(%rbp)
    movl    $-1, -112(%rbp)
    ## Switch to the final basic_ostringstream vtable (_ZTV...); the two
    ## pointers are kept in r13/rbx for re-use at destruction time.
    movq    __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rbx
    leaq    24(%rbx), %r13
    movq    %r13, -368(%rbp)
    addq    $64, %rbx
    movq    %rbx, -256(%rbp)
Ltmp3:
    ## basic_streambuf base-object constructor on the stringbuf.
    ## May throw -> landing pad Ltmp5.
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev
Ltmp4:
## BB#2:
    ## Install the basic_stringbuf vtable pointer (vtable + 16) and zero
    ## four stringbuf fields at -296..-272.
    ## NOTE(review): -296..-280 looks like the buffer std::string (its
    ## low bit is tested and -280 freed during cleanup); 16 at -264 is
    ## presumably the open mode -- confirm against libc++.
    movq    __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %r15
    addq    $16, %r15
    movq    %r15, -360(%rbp)
    movq    $0, -272(%rbp)
    movq    $0, -280(%rbp)
    movq    $0, -288(%rbp)
    movq    $0, -296(%rbp)
    movl    $16, -264(%rbp)
    ## Zero-initialise the 24-byte temporary string at -80..-57.
    xorps   %xmm0, %xmm0
    movaps  %xmm0, -80(%rbp)
    movq    $0, -64(%rbp)
Ltmp6:
    ## basic_stringbuf::str(const basic_string&) with the zeroed temporary.
    ## May throw -> landing pad Ltmp8.
    leaq    -80(%rbp), %rsi
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE
Ltmp7:
## BB#3:
    ## Destroy the temporary: if bit 0 of its first byte is set, pass the
    ## pointer at -64 to operator delete (presumably libc++'s
    ## heap-allocated "long string" flag -- confirm).
    testb   $1, -80(%rbp)
    je  LBB0_5
## BB#4:
    movq    -64(%rbp), %rdi
    callq   __ZdlPv
LBB0_5:
Ltmp9:
    ## __put_character_sequence(stream, L_.str, 4): inserts the 4-character
    ## literal. Ltmp9..Ltmp14 share landing pad Ltmp15.
    leaq    L_.str(%rip), %rsi
    leaq    -368(%rbp), %rdi
    movl    $4, %edx
    callq   __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
Ltmp10:
## BB#6:
Ltmp11:
    ## basic_ostream::operator<<(int) with 42, on the stream reference
    ## returned in %rax by the previous call.
    movl    $42, %esi
    movq    %rax, %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi
Ltmp12:
## BB#7:
Ltmp13:
    ## basic_stringbuf::str() const: result string is written to -104.
    leaq    -104(%rbp), %rdi
    movq    %r12, %rsi
    callq   __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv
Ltmp14:
## BB#8:
    ## The returned string is unused: free its buffer (-88) if bit 0 of
    ## its first byte is set.
    testb   $1, -104(%rbp)
    je  LBB0_10
## BB#9:
    movq    -88(%rbp), %rdi
    callq   __ZdlPv
LBB0_10:
    ## Normal-path destruction of the stream: re-store the three vtable
    ## pointers, then free the stringbuf's string buffer if flagged.
    movq    %r13, -368(%rbp)
    movq    %rbx, -256(%rbp)
    movq    %r15, -360(%rbp)
    testb   $1, -296(%rbp)
    je  LBB0_12
## BB#11:
    movq    -280(%rbp), %rdi
    callq   __ZdlPv
LBB0_12:
    ## Base-object destructors in order: basic_streambuf (stringbuf part),
    ## basic_ostream (second argument is VTT + 8), then basic_ios.
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
    movq    __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
    addq    $8, %rsi
    leaq    -368(%rbp), %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
    movq    %r14, %rdi
    callq   __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
    ## Epilogue: release locals and restore callee-saved registers in
    ## reverse push order.
    addq    $328, %rsp              ## imm = 0x148
    popq    %rbx
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
    popq    %rbp
    retq
    ## ---- Exception landing pads (never reached on the normal path) ----
    ## Each pad saves the exception pointer from %rax to -48 and joins the
    ## cleanup chain LBB0_18 -> LBB0_22 at the point matching how much of
    ## the stream had been constructed; the chain ends in _Unwind_Resume.
LBB0_13:
Ltmp8:
    ## stringbuf::str(const string&) threw: free the temporary string if
    ## flagged, then continue with the stringbuf's own buffer.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    testb   $1, -80(%rbp)
    je  LBB0_18
## BB#14:
    movq    -64(%rbp), %rdi
    callq   __ZdlPv
    testb   $1, -296(%rbp)
    jne LBB0_19
    jmp LBB0_20
LBB0_16:
Ltmp5:
    ## basic_streambuf constructor threw: only the ostream and ios bases
    ## need destroying.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    jmp LBB0_21
LBB0_15:
Ltmp2:
    ## ios_base::init threw: only the basic_ios base needs destroying.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    jmp LBB0_22
LBB0_17:
Ltmp15:
    ## One of the insertion calls or str() const threw: re-store the
    ## vtable pointers and run the full destruction sequence.
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    movq    %r13, -368(%rbp)
    movq    %rbx, -256(%rbp)
    movq    %r15, -360(%rbp)
LBB0_18:
    testb   $1, -296(%rbp)
    je  LBB0_20
LBB0_19:
    movq    -280(%rbp), %rdi
    callq   __ZdlPv
LBB0_20:
    movq    %r12, %rdi
    callq   __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
LBB0_21:
    movq    __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
    addq    $8, %rsi
    leaq    -368(%rbp), %rdi
    callq   __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
LBB0_22:
    movq    %r14, %rdi
    callq   __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
    ## Reload the saved exception pointer and continue unwinding; there is
    ## no instruction after this call, so it is not expected to return.
    movq    -48(%rbp), %rdi         ## 8-byte Reload
    callq   __Unwind_Resume
Lfunc_end0:
    .cfi_endproc
    ##----------------------------------------------------------------------
    ## Exception table (LSDA) for __Z18ostringstream_testv, referenced by
    ## the function's `.cfi_lsda 16, Lexception0` directive.
    ## Header: four encoding/size bytes plus the TType base offset.
    ## Body: five call-site records, each made of
    ##   .long start   (offset of the range from Lfunc_begin0)
    ##   .long length  (size of the range)
    ##   .long pad     (landing-pad offset from Lfunc_begin0, 0 = none)
    ##   .byte action  (0 = cleanup only)
    ## i.e. 13 bytes per record, 5 * 13 = 65 = the call-site table length.
    ##----------------------------------------------------------------------
    .section    __TEXT,__gcc_except_tab
    .p2align    2
GCC_except_table0:
Lexception0:
    .byte   255                     ## @LPStart Encoding = omit
    .byte   155                     ## @TType Encoding = indirect pcrel sdata4
    .asciz  "\303\200"              ## @TType base offset
    .byte   3                       ## Call site Encoding = udata4
    .byte   65                      ## Call site table length
    ## Site 1: the ios_base::init call -> pad Ltmp2.
Lset0 = Ltmp0-Lfunc_begin0              ## >> Call Site 1 <<
    .long   Lset0
Lset1 = Ltmp1-Ltmp0                     ##   Call between Ltmp0 and Ltmp1
    .long   Lset1
Lset2 = Ltmp2-Lfunc_begin0              ##     jumps to Ltmp2
    .long   Lset2
    .byte   0                       ##   On action: cleanup
    ## Site 2: the basic_streambuf constructor call -> pad Ltmp5.
Lset3 = Ltmp3-Lfunc_begin0              ## >> Call Site 2 <<
    .long   Lset3
Lset4 = Ltmp4-Ltmp3                     ##   Call between Ltmp3 and Ltmp4
    .long   Lset4
Lset5 = Ltmp5-Lfunc_begin0              ##     jumps to Ltmp5
    .long   Lset5
    .byte   0                       ##   On action: cleanup
    ## Site 3: the stringbuf::str(const string&) call -> pad Ltmp8.
Lset6 = Ltmp6-Lfunc_begin0              ## >> Call Site 3 <<
    .long   Lset6
Lset7 = Ltmp7-Ltmp6                     ##   Call between Ltmp6 and Ltmp7
    .long   Lset7
Lset8 = Ltmp8-Lfunc_begin0              ##     jumps to Ltmp8
    .long   Lset8
    .byte   0                       ##   On action: cleanup
    ## Site 4: one range covering three calls (character-sequence insert,
    ## integer insert, str() const) that all share pad Ltmp15.
Lset9 = Ltmp9-Lfunc_begin0              ## >> Call Site 4 <<
    .long   Lset9
Lset10 = Ltmp14-Ltmp9                   ##   Call between Ltmp9 and Ltmp14
    .long   Lset10
Lset11 = Ltmp15-Lfunc_begin0            ##     jumps to Ltmp15
    .long   Lset11
    .byte   0                       ##   On action: cleanup
    ## Site 5: everything from Ltmp14 to the end of the function (the
    ## destructor calls and the landing pads themselves) has no landing pad.
Lset12 = Ltmp14-Lfunc_begin0            ## >> Call Site 5 <<
    .long   Lset12
Lset13 = Lfunc_end0-Ltmp14              ##   Call between Ltmp14 and Lfunc_end0
    .long   Lset13
    .long   0                       ##     has no landing pad
    .byte   0                       ##   On action: cleanup
    .p2align    2

score 2 · Accepted Answer

造成这种差异的最可能原因是：IOStream 的实现被内联展开，而使用 sprintf() 只是一次函数调用。本质上并没有什么能阻止 IOStreams 在库中实现。不过，这确实需要一点抽象和规划：标准中的定义使用了模板，而模板通常只是内联实现的。将常用的实例化（针对字符类型 char 和 wchar_t）声明为 extern 模板并显式实例化它们，则需要额外的工作。我很久以前就证明过，这样做在编译时间方面确实有回报，并且至少 libstdc++ 在库中预先实例化了 IOStreams 函数。根据您的实验，libc++ 似乎没有这样做。

c++ - 为什么简单使用 ostringstream 会生成这么多汇编代码？

1 回答 1

Related

Reference