c++ - 为什么从对齐的 std::array 进行的初始加载在自动矢量化后仍是标量加载？(g++/clang++)

Question

我无法理解是什么阻止编译器在从std::array<uint64_t,...>读取数据时使用初始向量加载。

我知道 gcc 可以使用 -fopt-info-vec-* 生成调试信息。我无法从详细日志中找到任何可以表明为什么两个编译器都做出相同的次优决定来使用初始标量加载的内容。

另一方面，我不知道如何让 clang 提供有关矢量化问题的详细信息。-Rpass-analysis=loop-vectorize 仅报告 init 中的循环不值得交错（interleave）。当然，我的内在函数版本证明了循环是可以向量化的，但所需的转换可能过于复杂，不能指望编译器自动完成。

我当然可以使用内在函数实现热路径，但这需要为每种 CPU 架构重复实现相同的逻辑。我更喜欢编写编译器可以完美矢量化的标准 C++ 代码。这样一来，借助 target_clones 属性，或者宏加 target 属性，用不同的标志多次编译同一份代码就变得很简单。

如何让编译器告诉我为什么加载（load）无法矢量化？

我怀疑 gcc 可能已经打印出了相关信息，只是我不知道该找什么。

为什么自动矢量化在初始加载时失败？

    /**
     * This is a test case removing abstraction layers from my actual code. My
     * real code has one extra problem: access to pack loses alignment
     * information. That wasn't the only issue, though. Compilers still generate
     * suboptimal machine code with alignment information present. I fail to
     * understand why loads are treated differently compared to stores to
     * same address when auto-vectorization is used.
     *
     * I tested gcc 6.2 and clang 3.9
     * g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
     * clang++ -O3 -g -march=native vectest.cc -o vectest
     */


    #include <array>
    #include <cstdint>

    alignas(32) std::array<uint64_t, 52> pack;
    alignas(32) uint64_t board[4];

    __attribute__((noinline))
    static void init(uint64_t initial)
    {
        /* Fills pack so that element i holds a single set bit at position
         * (i + initial).
         *
         * Precondition: i + initial must stay below 64 for every index, i.e.
         * initial <= 64 - pack.size(); a larger shift count is undefined.
         *
         * Clang seems to prefer a large constant table and an unrolled copy,
         * which should perform worse outside a micro benchmark: L1 misses
         * and memory bandwidth are bigger bottlenecks than ALU instruction
         * execution. But of course this code won't be compiled into a hot
         * path, so I don't care how it is compiled as long as it works
         * correctly.
         *
         * The most interesting detail from clang is that vectorized stores
         * are generated correctly, like:
    4005db:       vpsllvq %ymm2,%ymm1,%ymm2
    4005e0:       vmovdqa %ymm2,0x200a78(%rip)        # 601060 <pack>
    4005e8:       vpaddq 0x390(%rip),%ymm0,%ymm2        # 400980 <_IO_stdin_used+0x60>
    4005f0:       vpsllvq %ymm2,%ymm1,%ymm2
    4005f5:       vmovdqa %ymm2,0x200a83(%rip)        # 601080 <pack+0x20>
    4005fd:       vpaddq 0x39b(%rip),%ymm0,%ymm2        # 4009a0 <_IO_stdin_used+0x80>
         *
         * gcc prefers a scalar loop.
         */

        for (unsigned i = 0; i < pack.size(); i++) {
            /* uint64_t{1} rather than 1UL: unsigned long is only 32 bits wide
             * on LLP64 targets, which would truncate the upper bits. */
            pack[i] = uint64_t{1} << (i + initial);
        }
    }

    #include "immintrin.h"
    __attribute__((noinline))
    static void expected_init(uint64_t initial)
    {
        /** Intrinsic implementation of init: the code the auto-vectorizer
         * would ideally produce. Writes four 64-bit elements of pack per
         * iteration; does nothing when AVX2 is not enabled.
         *
         * initial is just an extra run-time number that prevents the
         * compiler from turning the array into a constant table.
         */
    #if __AVX2__
        /* View pack's storage as 256-bit lanes. A reinterpret_cast is used
         * instead of punning the pointer through a union, because reading a
         * union member other than the one last written is undefined in ISO
         * C++. pack is declared alignas(32), which the aligned store needs.
         */
        __m256i *const lanes = reinterpret_cast<__m256i *>(pack.data());

        /* Lanes start as bits 0..3 (lowest lane last in the argument list). */
        __m256i t = _mm256_set_epi64x(1LL << 3, 1LL << 2, 1LL << 1, 1LL << 0);

        /* The intrinsic takes an int count; make the narrowing explicit. */
        t = _mm256_slli_epi64(t, static_cast<int>(initial));

        for (unsigned i = 0; i < pack.size() / 4; i++) {
            _mm256_store_si256(&lanes[i], t);
            /* Advance every lane by four bit positions for the next group. */
            t = _mm256_slli_epi64(t, 4);
        }
    #else
        (void)initial;
    #endif
    }

    __attribute__((noinline))
    static void iter_or()
    {
        /** ORs the elements of pack together in four interleaved streams
         * (element index modulo 4) using iterators, then stores the four
         * results to board[0..3].
         *
         * The statement layout is deliberate: the first four elements are
         * read outside the loop, which is the "initial load" the compilers
         * fail to turn into a single vector load.
         *
         * initial load (clang):
    4006f0:       vmovaps 0x200988(%rip),%xmm0        # 601080 <pack+0x20>
    4006f8:       vorps  0x200960(%rip),%xmm0,%xmm0        # 601060 <pack>
    400700:       vmovaps 0x200988(%rip),%xmm1        # 601090 <pack+0x30>
    400708:       vorps  0x200960(%rip),%xmm1,%xmm1        # 601070 <pack+0x10>
    400710:       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
        * expected:
    400810:       vmovaps 0x200868(%rip),%ymm0        # 601080 <pack+0x20>
    400818:       vorps  0x200840(%rip),%ymm0,%ymm0        # 601060 <pack>
    400820:       vorps  0x200878(%rip),%ymm0,%ymm0        # 6010a0 <pack+0x40>
        */

        auto iter = pack.begin();
        /* Seed the four accumulators with the first four elements. */
        uint64_t n(*iter++),
             e(*iter++),
             s(*iter++),
             w(*iter++);
        /* Consumes four elements per iteration; the loop only hits end()
         * exactly because pack.size() is a multiple of four. */
        for (;iter != pack.end();) {
            n |= *iter++;
            e |= *iter++;
            s |= *iter++;
            w |= *iter++;
        }
        /** The store is correctly vectorized to a single instruction. */
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    __attribute__((noinline))
    static void index_or()
    {
        /** Index-based variant of iter_or: ORs pack together in four
         * interleaved streams (index modulo 4) and stores the results to
         * board[0..3].
         *
         * Clang compiles this to the same code as the iterator variant. gcc
         * goes completely insane; I don't even want to try to guess what all
         * the permutation stuff is trying to achieve.
         */
        unsigned i;
        /* Seed the four accumulators with the first four elements. */
        uint64_t n(pack[0]),
             e(pack[1]),
             s(pack[2]),
             w(pack[3]);
        /* Starts at 4 because elements 0..3 were consumed above; i+3 stays
         * in range because pack.size() is a multiple of four. */
        for (i = 4 ; i < pack.size(); i+=4) {
            n |= pack[i+0];
            e |= pack[i+1];
            s |= pack[i+2];
            w |= pack[i+3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    #include "immintrin.h"

    __attribute__((noinline))
    static void expected_result()
    {
        /** Intrinsic implementation of the OR reduction: the code I would
         * expect auto-vectorization to produce from iter_or/index_or. I
         * simply can't understand why both compilers fail to achieve it.
         *
         * ORs all 256-bit groups of pack together and stores the result to
         * board. Does nothing when AVX2 is not enabled.
         */
    #if __AVX2__
        /* View pack's storage as 256-bit lanes. A reinterpret_cast is used
         * instead of punning the pointer through a union, because reading a
         * union member other than the one last written is undefined in ISO
         * C++. Both pack and board are declared alignas(32), which the
         * aligned load/store intrinsics need.
         */
        const __m256i *const lanes =
            reinterpret_cast<const __m256i *>(pack.data());

        /* The first group seeds the accumulator; the rest are OR-ed in. */
        __m256i res = _mm256_load_si256(&lanes[0]);
        for (unsigned i = 1; i < pack.size() / 4; i++) {
            res = _mm256_or_si256(res, _mm256_load_si256(&lanes[i]));
        }
        _mm256_store_si256(reinterpret_cast<__m256i *>(board), res);
    #endif
    }

    int main(int c, char **v)
    {
        /* The argument count minus one is only known at run time, so the
         * compiler cannot fold the array initialization into a constant. */
        const int seed = c - 1;
        static_cast<void>(v);

        /* Fill pack twice: intrinsic reference first, then the plain loop. */
        expected_init(seed);
        init(seed);

        /* Run the three reduction variants under comparison. */
        iter_or();
        index_or();
        expected_result();
        return 0;
    }

score 1 · Accepted Answer

看起来 gcc 和 clang 都无法对循环外的初始加载进行矢量化。如果把代码改为先将临时变量清零，然后从第一个元素开始做按位或，两个编译器都会做得更好。Clang 生成了不错的展开向量代码（瓶颈在于只用了一个 ymm 寄存器，每条指令都依赖于前一条）。GCC 生成的代码稍差一些，多了一条初始 vpxor，而且循环很糟糕，每次迭代只执行一条 vpor。

我还测试了一些替代实现，其中在微基准测试中表现最好的，是在 clang 展开代码的基础上改为交替使用两个寄存器的版本。

/* only reduce (calling this function from a for loop):
 * ST 7.3 cycles (ST=single thread)
 * SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
 * shuffle+reduce (calling Fisher-Yates shuffle and then this function):
 * ST 222 cycles
 * SMT 383 cycles 
 */
    "vmovaps 0x00(%0), %%ymm0\n"
    "vmovaps 0x20(%0), %%ymm1\n"
    "vpor 0x40(%0), %%ymm0, %%ymm0\n"
    "vpor 0x60(%0), %%ymm1, %%ymm1\n"
    "vpor 0x80(%0), %%ymm0, %%ymm0\n"
    "vpor 0xA0(%0), %%ymm1, %%ymm1\n"
    "vpor 0xC0(%0), %%ymm0, %%ymm0\n"
    "vpor 0xE0(%0), %%ymm1, %%ymm1\n"
    "vpor 0x100(%0), %%ymm0, %%ymm0\n"
    "vpor 0x120(%0), %%ymm1, %%ymm1\n"
    "vpor 0x140(%0), %%ymm0, %%ymm0\n"
    "vpor 0x160(%0), %%ymm1, %%ymm1\n"
    "vpor 0x180(%0), %%ymm0, %%ymm0\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%1)\n"

Clang 展开循环的时间如下

/* only reduce:
 * ST 9.8 cycles
 * SMT 21.8 cycles
 * shuffle+reduce:
 * ST 223 cycles
 * SMT 385 cycles
 */

但是，SMT 使展开代码性能下降的这些数字看起来很可疑。我决定尝试把 GCC 的循环写得更好一些，但它仍然明显比展开版本慢。后来我决定使用两个寄存器并把循环展开一次，以打破指令之间的依赖关系。这样得到的 shuffle+reduce 代码比完全展开的版本稍微快一些。

/* Element index of the first group handled via [cnt]; the top two groups
 * (offsets 0x180 and 0x160) are loaded directly before the loop. */
size_t end = pack.size() - 3*4;
/* volatile: the useful effect is the store through [out], which is not an
 * output operand, so the statement must not be discarded or hoisted. */
asm volatile (
/* The best SMT option outside micro optimization.
 * This allows executing two vpor instructions at the same time and
 * halves the loop count with a single unroll.
 *
 * only reduce:
 * ST 13.0 cycles
 * SMT 20.0 cycles
 * shuffle+reduce:
 * ST 221 cycles
 * SMT 380 cycles
 */
    "vmovaps 0x180(%[pack]), %%ymm0\n"
    "vmovaps 0x160(%[pack]), %%ymm1\n"
    "vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "1:\n"
    "vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
    "vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "sub $8, %[cnt]\n"
    "jne 1b\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%[out])\n"
    : [cnt]"+r"(end)
    : [pack]"r"(begin), [out]"r"(hands_)
    /* The asm overwrites both vector registers, changes the flags (sub),
     * and reads/writes memory the operands only point to. */
    : "ymm0", "ymm1", "cc", "memory");

但是当代码在 Fisher-Yates shuffle 之后运行时，差异非常小。即使是在仅归约（only reduce）基准测试中明显落败（16.4/38.8）的 gcc 版本，在 shuffle+reduce 测试中也以几乎相同的速度（228/387）运行。

c++ - 为什么从对齐的 std::array 进行的初始加载在自动矢量化后仍是标量加载？(g++/clang++)

1 回答 1

Related

Reference