通常,用标准库算法来表达你的算法,既能帮到你自己,也能帮到优化器。
例如:
#include <boost/iterator/zip_iterator.hpp>
// Stores the element-wise product a[i] * b[i] into p[i] for each i in [0, n).
// The two input arrays are traversed in lockstep through a Boost zip iterator,
// so the whole loop is expressed as a single std::transform call.
void bar(int n, int * p, const int * a, const int * b)
{
    // Multiplies the two components of one zipped (a[i], b[i]) element.
    const auto multiply_pair = [](auto&& pair) {
        return boost::get<0>(pair) * boost::get<1>(pair);
    };

    auto first = boost::make_zip_iterator(boost::make_tuple(a, b));
    auto last = boost::make_zip_iterator(boost::make_tuple(a + n, b + n));
    std::transform(first, last, p, multiply_pair);
}
clang 3.9.1 会把它编译成:
# Compiler output for bar(). The prologue and setup code are elided below, so
# the initial values of rbx, rdi, r8-r11 and r14 are not visible in this listing.
bar(int, int*, int const*, int const*): # @bar(int, int*, int const*, int const*)
... alignment stuff ...
# Main vector loop, unrolled 8 times. Each group of three instructions loads
# 32 bytes (eight 32-bit ints) from [rcx + ...], multiplies them lane-wise with
# the 32 bytes at [rdx + ...] (vpmulld), and stores the result to [rsi + ...].
.LBB0_7: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi]
vmovdqu ymmword ptr [rsi + 4*rdi], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 32]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 32]
vmovdqu ymmword ptr [rsi + 4*rdi + 32], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 64]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 64]
vmovdqu ymmword ptr [rsi + 4*rdi + 64], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 96]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 96]
vmovdqu ymmword ptr [rsi + 4*rdi + 96], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 128]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 128]
vmovdqu ymmword ptr [rsi + 4*rdi + 128], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 160]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 160]
vmovdqu ymmword ptr [rsi + 4*rdi + 160], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 192]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 192]
vmovdqu ymmword ptr [rsi + 4*rdi + 192], ymm0
vmovdqu ymm0, ymmword ptr [rcx + 4*rdi + 224]
vpmulld ymm0, ymm0, ymmword ptr [rdx + 4*rdi + 224]
vmovdqu ymmword ptr [rsi + 4*rdi + 224], ymm0
# rdi is the element index (scaled by 4 in the addresses): 8 vectors * 8 ints
# = 64 elements per iteration.
add rdi, 64
# rbx steps by 8 and the loop exits when it reaches zero; presumably it was
# initialised to a negative vector count in the elided setup code.
add rbx, 8
jne .LBB0_7
# Leftover-vector loop: skipped when r14 is zero, otherwise handles one
# 32-byte vector (eight ints) per iteration.
.LBB0_8:
test r14, r14
je .LBB0_11
lea rbx, [rdx + 4*rdi]
lea rax, [rcx + 4*rdi]
lea rdi, [rsi + 4*rdi]
# Negate the count so the loop can count upwards and terminate on zero.
neg r14
.LBB0_10: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rax]
vpmulld ymm0, ymm0, ymmword ptr [rbx]
vmovdqu ymmword ptr [rdi], ymm0
add rbx, 32
add rax, 32
add rdi, 32
add r14, 1
jne .LBB0_10
# Jump straight to the epilogue when r8 == r9; presumably this means the
# vector loops already covered every element (both values come from elided code).
.LBB0_11:
cmp r8, r9
je .LBB0_16
lea rsi, [rsi + 4*r9]
lea rcx, [rcx + 4*r9]
lea rdx, [rdx + 4*r9]
.LBB0_13:
add rcx, 4
add rdx, 4
# Scalar tail loop: one 32-bit multiply per iteration, with all three pointers
# advancing by 4 bytes.
.LBB0_14: # =>This Inner Loop Header: Depth=1
mov rax, rdx
mov edx, dword ptr [rcx - 4]
imul edx, dword ptr [rax - 4]
mov dword ptr [rsi], edx
add rsi, 4
lea rdx, [rax + 4]
# lea does not modify flags, so the jne below still tests the cmp above it.
cmp r11, rcx
lea rcx, [rcx + 4]
jne .LBB0_14
# Second exit test on the other input pointer; presumably both checks come from
# the zip iterator comparing each underlying pointer against its end.
cmp r10, rax
jne .LBB0_14
# Epilogue: restore callee-saved registers and clear the upper halves of the
# ymm registers before returning.
.LBB0_16:
pop rbx
pop r14
vzeroupper
ret
忽略对齐检查,我想你会同意编译器做得很好。
然而,gcc 似乎错过了这个向量化的机会。这可能是 gcc 的一个缺陷?