正确答案几乎总是:
编写正确的代码,启用优化,相信你的编译器。
给定:
/// Builds a histogram of 24-bit values read from a packed byte buffer.
///
/// Every three consecutive bytes in [from, to) are combined, first byte most
/// significant, into one 24-bit index, and the matching counter in `results`
/// is incremented.
///
/// @param results one counter per possible 24-bit value (256 * 256 * 256
///                entries). Note that `256^3` would be bitwise XOR in C++
///                (== 259), which is far too small for a 24-bit index.
/// @param from    first byte of the input.
/// @param to      one past the last byte of the input. The distance from
///                `from` must be a multiple of three; otherwise the loop
///                steps over `to` and never terminates cleanly.
void count_values(std::array<uint32_t, 256 * 256 * 256>& results,
                  const unsigned char* from,
                  const unsigned char* to)
{
    for (; from != to; from = std::next(from, 3)) {
        // unsigned char promotes to int, so the shifts cannot overflow.
        ++results[(*from << 16) | (*std::next(from, 1) << 8) | *(std::next(from, 2))];
    }
}
使用 -O3 编译
生成的汇编如下(内嵌解释性注释):
# Compiler output for the pointer-based count_values overload.
# NOTE: "arrayIjLm259EE" in the mangled name is std::array<unsigned int, 259>:
# the listing was compiled with the extent written as 256^3, which C++ parses
# as bitwise XOR (259), not as a power.
__Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
jmp LBB0_2
.align 4, 0x90
LBB0_1: ## %.lr.ph
## in Loop: Header=BB0_2 Depth=1
# dereference from and zero-extend the 8-bit value to 32 bits
movzbl (%rsi), %eax
shlq $16, %rax # shift left 16
movzbl 1(%rsi), %ecx # dereference *(from+1) and zero-extend to 32 bits
shlq $8, %rcx # shift left 8
orq %rax, %rcx # or into above result
movzbl 2(%rsi), %eax # dereference *(from+2) and zero-extend to 32 bits
orq %rcx, %rax # or into above result
incl (%rdi,%rax,4) # increment the correct counter
addq $3, %rsi # from += 3
LBB0_2: ## %.lr.ph
## =>This Inner Loop Header: Depth=1
cmpq %rdx, %rsi # while from != to
jne LBB0_1
## BB#3: ## %._crit_edge
popq %rbp
retq
.cfi_endproc
请注意,没有必要偏离标准构造或标准调用。编译器产生完美的代码。
为了进一步证明这一点,让我们疯狂地编写一个自定义迭代器,允许我们将函数简化为:
/// Builds a histogram of 24-bit values from a range of byte triples.
///
/// Each dereference of `from` yields one 24-bit value; the matching counter
/// in `results` is incremented and the iterator advances by one triple.
///
/// @param results one counter per possible 24-bit value (256 * 256 * 256
///                entries). Note that `256^3` would be bitwise XOR in C++
///                (== 259), which is far too small for a 24-bit index.
/// @param from    iterator at the first triple.
/// @param to      iterator one past the last triple.
void count_values(std::array<uint32_t, 256 * 256 * 256>& results,
                  byte_triple_iterator from,
                  byte_triple_iterator to)
{
    // Debug-only sanity check of the range; compiled out under NDEBUG.
    assert(iterators_correct(from, to));
    while (from != to) {
        ++results[*from++];
    }
}
这是这样一个迭代器的(基本)实现:
/// Minimal iterator-like cursor over a byte buffer that reads three bytes at
/// a time and presents them as a single 24-bit value.
struct byte_triple_iterator
{
    /// Wraps a raw byte pointer; implicit so plain pointers can be passed
    /// wherever a byte_triple_iterator is expected.
    constexpr byte_triple_iterator(const std::uint8_t* p)
        : _pos(p)
    {}

    /// Combines the three bytes at the current position, first byte most
    /// significant.
    std::uint32_t operator*() const noexcept {
        const int high = _pos[0] << 16;
        const int middle = _pos[1] << 8;
        const int low = _pos[2];
        return high | middle | low;
    }

    /// Pre-increment: step forward by one triple (three bytes).
    byte_triple_iterator& operator++() noexcept {
        _pos += 3;
        return *this;
    }

    /// Post-increment: step forward by one triple, returning the old position.
    byte_triple_iterator operator++(int) noexcept {
        const byte_triple_iterator previous(_pos);
        _pos += 3;
        return previous;
    }

    /// Exposes the underlying byte pointer.
    constexpr const std::uint8_t* byte_ptr() const {
        return _pos;
    }

private:
    // Comparisons are defined purely in terms of the underlying pointers.
    friend bool operator<(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._pos < to._pos;
    }

    friend bool operator==(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._pos == to._pos;
    }

    friend bool operator!=(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._pos != to._pos;
    }

    /// Distance from `from` to `to`, measured in bytes (not triples).
    friend std::ptrdiff_t byte_difference(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return to._pos - from._pos;
    }

    const std::uint8_t* _pos;
};
/// Checks that [from, to) is a usable range of whole byte triples.
///
/// @return true when `to` does not precede `from` and the byte distance
///         between them is a multiple of three. An empty range
///         (`from == to`) is accepted, since iterating over it is a no-op.
bool iterators_correct(const byte_triple_iterator& from,
                       const byte_triple_iterator& to)
{
    // Reject only a reversed range; rejecting from == to would make the
    // caller's debug assertion fire on perfectly valid empty input.
    if (to < from)
        return false;
    const auto dist = to.byte_ptr() - from.byte_ptr();
    return dist % 3 == 0;
}
现在我们有什么?
- 一个断言,用来检查我们的源数据长度确实正确(在调试版本中)
- 保证大小正确的输出结构
但是它对我们的目标代码做了什么?(使用 -O3 -DNDEBUG 编译)
# Compiler output for the byte_triple_iterator-based count_values overload.
# NOTE: "arrayIjLm259EE" in the mangled name is std::array<unsigned int, 259>:
# the listing was compiled with the extent written as 256^3, which C++ parses
# as bitwise XOR (259), not as a power.
.globl __Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
.align 4, 0x90
__Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp5:
.cfi_def_cfa_register %rbp
jmp LBB1_2
.align 4, 0x90
LBB1_1: ## %.lr.ph
## in Loop: Header=BB1_2 Depth=1
movzbl (%rsi), %eax # load byte 0 of the triple, zero-extended
shlq $16, %rax # shift left 16
movzbl 1(%rsi), %ecx # load byte 1 of the triple, zero-extended
shlq $8, %rcx # shift left 8
orq %rax, %rcx # or into above result
movzbl 2(%rsi), %eax # load byte 2 of the triple, zero-extended
orq %rcx, %rax # or into above result
incl (%rdi,%rax,4) # increment the 32-bit counter at that index
addq $3, %rsi # advance the byte pointer by 3
LBB1_2: ## %.lr.ph
## =>This Inner Loop Header: Depth=1
cmpq %rdx, %rsi # loop while %rsi != %rdx
jne LBB1_1
## BB#3: ## %._crit_edge
popq %rbp
retq
.cfi_endproc
答案:什么都没有——它同样高效。
教训?说真的:相信你的编译器!!!