I can't understand what prevents compilers from using an initial vector load when reading data from a std::array<uint64_t, ...>.

I know gcc can generate debug information with -fopt-info-vec-*. I can't find anything in the verbose log that would indicate why both compilers make the same suboptimal decision to use initial scalar loads.
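For reference, the kind of invocation I have been using to get the gcc vectorizer report looks roughly like this (-fopt-info-vec-missed restricts the output to missed optimizations, and the -all variant can dump everything to a file):

g++ -O3 -march=native -fvect-cost-model=unlimited -fopt-info-vec-missed vectest.cc -o vectest
g++ -O3 -march=native -fvect-cost-model=unlimited -fopt-info-vec-all=vec.log vectest.cc -o vectest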
On the other hand, I don't know how to get clang to provide detailed information about vectorization problems. -Rpass-analysis=loop-vectorize only reports that the loop in init is not worth interleaving. Of course, my intrinsics version proves that the loop can be vectorized, but the required transformations are probably too complex to expect from a compiler.
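For completeness, the remark flags I am aware of for clang are the ones below; they name the loops that were or were not vectorized, but so far none of them explains why the loads end up scalar:

clang++ -O3 -march=native -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize vectest.cc -o vectest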
I could of course implement the hot path with intrinsics, but that would mean duplicating the same logic for each CPU architecture. I would much prefer to write standard C++ code that compilers can vectorize perfectly. Then it becomes simple to compile the same code multiple times with different flags, using the target_clones attribute, or macros and the target attribute; a sketch of what I have in mind follows.
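A minimal sketch of that multi-versioning idea, assuming gcc's target_clones support (the function name, the simplified single-accumulator loop and the target list are just placeholders, not my real code):

#include <array>
#include <cstdint>

alignas(32) std::array<uint64_t, 52> pack;

/* gcc emits one clone per listed target plus a resolver that picks the
 * best clone at load time, so the same standard C++ loop can be
 * vectorized with AVX2, SSE or left scalar depending on the machine. */
__attribute__((target_clones("avx2", "sse4.2", "default")))
uint64_t or_all()
{
    uint64_t n = 0;
    for (unsigned i = 0; i < pack.size(); i++)
        n |= pack[i];
    return n;
}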
How can I get the compiler to tell me why these loads fail to vectorize?

I suspect gcc may already print that information and I simply don't know what I'm looking for.

Why does auto-vectorization fail on the initial loads?
/**
 * This is a test case with the abstraction layers of my actual code removed.
 * My real code has one extra issue: access to pack loses alignment
 * information. But that is not the only problem; compilers still generate
 * suboptimal machine code even when alignment information is present. I fail
 * to understand why loads are treated differently from stores to the
 * same address when auto-vectorization is used.
*
* I tested gcc 6.2 and clang 3.9
g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
* clang++ -O3 -g -march=native vectest.cc -o vectest
*/
#include <array>
#include <cstdint>
alignas(32) std::array<uint64_t, 52> pack;
alignas(32) uint64_t board[4];
__attribute__((noinline))
static void init(uint64_t initial)
{
/* Clang seems to prefer a large constant table and an unrolled copy,
 * which should perform worse outside a micro benchmark. L1 misses
 * and memory bandwidth are a bigger bottleneck than ALU instruction
 * execution. But of course this code won't end up on the hot path, so
 * I don't care how it is compiled as long as it works correctly.
 *
 * But the most interesting detail from clang is that the vectorized
 * stores are generated correctly, like:
4005db: vpsllvq %ymm2,%ymm1,%ymm2
4005e0: vmovdqa %ymm2,0x200a78(%rip) # 601060 <pack>
4005e8: vpaddq 0x390(%rip),%ymm0,%ymm2 # 400980 <_IO_stdin_used+0x60>
4005f0: vpsllvq %ymm2,%ymm1,%ymm2
4005f5: vmovdqa %ymm2,0x200a83(%rip) # 601080 <pack+0x20>
4005fd: vpaddq 0x39b(%rip),%ymm0,%ymm2 # 4009a0 <_IO_stdin_used+0x80>
*
 * gcc prefers a scalar loop.
*/
for (unsigned i = 0; i < pack.size(); i++) {
pack[i] = 1UL << (i + initial);
}
}
#include "immintrin.h"
__attribute__((noinline))
static void expected_init(uint64_t initial)
{
/** Just an intrinsics implementation of init that would, IMO, be the
 * ideal optimization.
*/
#if __AVX2__
unsigned i;
union {
uint64_t *mem;
__m256i *avx;
} conv;
conv.mem = &pack[0];
__m256i t = _mm256_set_epi64x(
1UL << 3,
1UL << 2,
1UL << 1,
1UL << 0
);
/* initial is just an extra runtime value to prevent the array
 * initialization from being constant folded
*/
t = _mm256_slli_epi64(t, initial);
for(i = 0; i < pack.size()/4; i++) {
_mm256_store_si256(&conv.avx[i], t);
t = _mm256_slli_epi64(t, 4);
}
#endif
}
__attribute__((noinline))
static void iter_or()
{
/** initial load (clang):
4006f0: vmovaps 0x200988(%rip),%xmm0 # 601080 <pack+0x20>
4006f8: vorps 0x200960(%rip),%xmm0,%xmm0 # 601060 <pack>
400700: vmovaps 0x200988(%rip),%xmm1 # 601090 <pack+0x30>
400708: vorps 0x200960(%rip),%xmm1,%xmm1 # 601070 <pack+0x10>
400710: vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
* expected:
400810: vmovaps 0x200868(%rip),%ymm0 # 601080 <pack+0x20>
400818: vorps 0x200840(%rip),%ymm0,%ymm0 # 601060 <pack>
400820: vorps 0x200878(%rip),%ymm0,%ymm0 # 6010a0 <pack+0x40>
*/
auto iter = pack.begin();
uint64_t n(*iter++),
e(*iter++),
s(*iter++),
w(*iter++);
for (;iter != pack.end();) {
n |= *iter++;
e |= *iter++;
s |= *iter++;
w |= *iter++;
}
/** The store is correctly vectorized into a single instruction */
board[0] = n;
board[1] = e;
board[2] = s;
board[3] = w;
}
__attribute__((noinline))
static void index_or()
{
/** Clang compiles this to the same code as the iterator variant. gcc
 * goes completely insane. I don't even want to try to guess what all
 * the permutation stuff is trying to achieve.
 */
unsigned i;
uint64_t n(pack[0]),
e(pack[1]),
s(pack[2]),
w(pack[3]);
for (i = 4 ; i < pack.size(); i+=4) {
n |= pack[i+0];
e |= pack[i+1];
s |= pack[i+2];
w |= pack[i+3];
}
board[0] = n;
board[1] = e;
board[2] = s;
board[3] = w;
}
#include "immintrin.h"
__attribute__((noinline))
static void expected_result()
{
/** An intrinsics implementation of what I would expect auto-vectorization
 * to turn my C++ code into. I simply can't understand why both compilers
 * fail to achieve the result I expect.
 */
#if __AVX2__
union {
uint64_t *mem;
__m256i *avx;
} conv;
conv.mem = &pack[0];
unsigned i;
__m256i res = _mm256_load_si256(&conv.avx[0]);
for (i = 1; i < pack.size()/4; i++) {
__m256i temp = _mm256_load_si256(&conv.avx[i]);
res = _mm256_or_si256(res, temp);
}
conv.mem = board;
_mm256_store_si256(conv.avx, res);
#endif
}
int main(int c, char **v)
{
(void)v;
expected_init(c - 1);
init(c - 1);
iter_or();
index_or();
expected_result();
}