c++11 - 将最佳 uint8_t 位图转换为 8 x 32 位 SIMD“布尔”向量

Question

作为压缩算法的一部分，我正在寻找实现以下目标的最佳方法：

我有一个简单的位图uint8_t。例如 01010011

我想要的是一个__m256i形式： (0, maxint, 0, maxint, 0, 0, maxint, maxint)

实现此目的的一种方法是将 8 x maxint 的向量改组为零向量。但这首先需要我将我的扩展uint8_t为正确的随机播放位图。

我想知道是否有更好的方法？

score 4 · Accepted Answer

我想我最初可能会采用“蛮力和无知”的方法，也许是这样的：

uint8_t u = 0x53; // 01010011

const union {
    uint32_t a[4];
    __m128i v;
} kLUT[16] = { { {  0,  0,  0,  0 } },
               { { -1,  0,  0,  0 } },
               { {  0, -1,  0,  0 } },
               { { -1, -1,  0,  0 } },
               { {  0,  0, -1,  0 } },
               { { -1,  0, -1,  0 } },
               { {  0, -1, -1,  0 } },
               { { -1, -1, -1,  0 } },
               { {  0,  0,  0, -1 } },
               { { -1,  0,  0, -1 } },
               { {  0, -1,  0, -1 } },
               { { -1, -1,  0, -1 } },
               { {  0,  0, -1, -1 } },
               { { -1,  0, -1, -1 } },
               { {  0, -1, -1, -1 } },
               { { -1, -1, -1, -1 } } };
__m256i v = _mm256_set_m128i(kLUT[u >> 4].v, kLUT[u & 15].v);

使用clang -O3它编译为：

movl    %ebx, %eax                ;; eax = ebx = u
andl    $15, %eax                 ;; get low offset = (u & 15) * 16
shlq    $4, %rax
leaq    _main.kLUT(%rip), %rcx    ;; rcx = kLUT
vmovaps (%rax,%rcx), %xmm0        ;; load low half of ymm0 from kLUT
andl    $240, %ebx                ;; get high offset = (u >> 4) * 16
vinsertf128 $1, (%rbx,%rcx), %ymm0, %ymm0
                                  ;; load high half of ymm0 from kLUT

FWIW我为三个实现组合了一个简单的测试工具：（i）一个简单的标量代码参考实现，（ii）上面的代码，（iii）基于@Zboson的答案的实现，（iv）稍微改进的版本（iii ) 和 (v) 使用 @MarcGlisse 的建议对 (iv) 进行了进一步改进。我使用 2.6GHz Haswell CPU（用编译）得到以下结果clang -O3：

scalar code:                                 7.55336 ns / vector
Paul R:                                      1.36016 ns / vector
Z boson:                                     1.24863 ns / vector
Z boson (improved):                          1.07590 ns / vector
Z boson (improved + @MarcGlisse suggestion): 1.08195 ns / vector

所以@Zboson 的解决方案赢了大约 10% - 20%，大概是因为他们只需要 1 个负载，而我的需要 2 个。

如果我们得到任何其他实现，我会将它们添加到测试工具中并更新结果。

@Zboson 实现的略微改进版本：

__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
v = _mm256_xor_si256(v, mask);
return _mm256_cmpeq_epi32(v, _mm256_setzero_si256());

@Zboson 实施的进一步改进版本，结合了@MarcGlisse 的建议：

__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
return _mm256_cmpeq_epi32(v, mask);

（注意mask需要在每个 32 位元素中包含复制的 8 位值，即0x01010101, 0x02020202, ..., 0x80808080）

score 2 · Accepted Answer

这是一个解决方案（PaulR 改进了我的解决方案，请参阅我的答案或他的答案的结尾）基于这个问题的变体最快方式到广播 32 位在 32 字节中。

__m256i t1 = _mm256_set1_epi8(x);
__m256i t2 = _mm256_and_si256(t1, mask);
__m256i t4 = _mm256_cmpeq_epi32(t2, _mm256_setzero_si256());
t4 = _mm256_xor_si256(t4, _mm256_set1_epi32(-1));

我现在没有 AVX2 硬件来测试它，但这里有一个 SSE2 版本，显示它可以工作，它还显示了如何定义掩码。

#include <x86intrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    char mask[32] = {
        0x01, 0x00, 0x00, 0x00,
        0x02, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00,
        0x10, 0x00, 0x00, 0x00,
        0x20, 0x00, 0x00, 0x00,
        0x40, 0x00, 0x00, 0x00,
        0x80, 0x00, 0x00, 0x00,
    };
    __m128i mask1 = _mm_loadu_si128((__m128i*)&mask[ 0]);
    __m128i mask2 = _mm_loadu_si128((__m128i*)&mask[16]);

    uint8_t x = 0x53; //0101 0011
    __m128i t1 = _mm_set1_epi8(x);
    __m128i t2 = _mm_and_si128(t1, mask1);
    __m128i t3 = _mm_and_si128(t1, mask2);
    __m128i t4 = _mm_cmpeq_epi32(t2,_mm_setzero_si128());
    __m128i t5 = _mm_cmpeq_epi32(t3,_mm_setzero_si128());
    t4 = _mm_xor_si128(t4, _mm_set1_epi32(-1));
    t5 = _mm_xor_si128(t5, _mm_set1_epi32(-1));

    int o1[4], o2[4];
    _mm_store_si128((__m128i*)o1, t4);
    _mm_store_si128((__m128i*)o2, t5);
    for(int i=0; i<4; i++) printf("%d \n", o1[i]);
    for(int i=0; i<4; i++) printf("%d \n", o2[i]);

}

编辑：

PaulR 改进了我的解决方案

__m256i v = _mm256_set1_epi8(u);
v = _mm256_and_si256(v, mask);
v = _mm256_xor_si256(v, mask);
return _mm256_cmpeq_epi32(v, _mm256_setzero_si256());

掩码定义为

int mask[8] = {
    0x01010101, 0x02020202, 0x04040404, 0x08080808,
    0x10101010, 0x20202020, 0x40404040, 0x80808080,
};

有关更多详细信息，请参阅他对性能测试的回答。

score 1 · Accepted Answer

根据所有答案，我使用 Agner Fog 的优秀库（它处理具有通用抽象的 AVX2、AVX 和 SSE 解决方案）编写了一个解决方案。想我会分享它作为替代答案：

// Used to generate 32 bit vector bitmasks from 8 bit ints
static const Vec8ui VecBitMask8(
      0x01010101
    , 0x02020202
    , 0x04040404
    , 0x08080808
    , 0x10101010
    , 0x20202020
    , 0x40404040
    , 0x80808080);

// As above, but for 64 bit vectors and 4 bit ints
static const Vec4uq VecBitMask4(
      0x0101010101010101
    , 0x0202020202020202
    , 0x0404040404040404
    , 0x0808080808080808);

template <typename V>
inline static Vec32c getBitmapMask();

template <> inline Vec32c getBitmapMask<Vec8ui>() {return VecBitMask8;};
template <> inline Vec32c getBitmapMask<Vec8i>() {return VecBitMask8;};
template <> inline Vec32c getBitmapMask<Vec4uq>() {return VecBitMask4;};
template <> inline Vec32c getBitmapMask<Vec4q>() {return VecBitMask4;};

// Returns a bool vector representing the bitmask passed.
template <typename V>
static inline V getBitmap(const uint8_t bitMask) {
    Vec32c mask = getBitmapMask<V>();
    Vec32c v1(bitMask);
    v1 = v1 & mask;
    return ((V)v1 == (V)mask);
}

c++11 - 将最佳 uint8_t 位图转换为 8 x 32 位 SIMD“布尔”向量

3 回答 3

Related

Reference