6

我有以下代码使用标志与 GCC 一起编译,但问题是弹出计数仅获得转换类型-msse4的最后四个 8 位。__m128i基本上我想要的是计算__m128i类型内的所有 16 个数字,但我不确定在创建变量后要调用什么内部函数popA。必须以某种popA方式转换为包含所有 128 位信息的整数?我想theres_mm_cvtsi128_si64并使用了一些随机操作,但我的操作系统是32位的。是否只有 shuffle 方法和 using _mm_cvtsi128_si32

编辑:如果 shuffle 方法是我需要帮助为我的 32 位操作系统实现它的唯一选项,请。

这是代码。

#include <stdio.h>
#include <smmintrin.h>
#include <emmintrin.h>

int main(void)
{
    int A = 1;
    __m128i popA = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);

    unsigned int integer = _mm_cvtsi128_si32(popA);
    //long long LONG = _mm_cvtsi128_si64(popA);//my OS is 32-bits so no luck here

    printf("integer = %d\n", integer);
    int pop = _mm_popcnt_u32(integer);
    //int popLONG = _mm_popcnt_u64(LONG);
    printf("popcount = %d\n", pop);
    //printf("popcount LONG = %d\n", popLONG);

    return 0;
}

编辑 2-msse -msse2 -msse3 -msse4 :虽然我不确定输出pop_count1()是否正确,但这个最终运行(使用 GCC 编译器标志)。

输出: pop_count1(): 1799 1799 1799 1799 1799 1799 1799 1799

pop_count2():population count for each byte: 1 1 1 1 1 1 1 1 0 1 2 3 4 5 6 7

  #include <stdio.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdint.h>
#include <tmmintrin.h>

void print128_num(__m128i var)
{
    uint16_t *val = (uint16_t*) &var;
    printf("pop_count1(): %i %i %i %i %i %i %i %i \n",
           val[0], val[1], val[2], val[3], val[4], val[5],
           val[6], val[7]);
}
static __m128i parallelPopcnt16bytes (__m128i xmm)//for pop_count2
{
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
}
void pop_count1()
{
    int A = 1;
    __m128i in = _mm_set_epi8( A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A);
    __m128i bit0 = _mm_set1_epi8( 0x80 );
    __m128i mask0 = _mm_and_si128( in, bit0 );
    __m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );

/* general pattern */
    __m128i bit1 = _mm_set1_epi8( 0x40 );
    __m128i mask1 = _mm_and_si128( in, bit1 );
    mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask1 );

/* next bit */
    __m128i bit2 = _mm_set1_epi8( 0x20 );
    __m128i mask2 = _mm_and_si128( in, bit2 );
    mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask2 );

    __m128i bit3 = _mm_set1_epi8( 0x10 );
    __m128i mask3 = _mm_and_si128( in, bit3 );
    mask3 = _mm_cmpeq_epi8( mask3, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask3 );

    __m128i bit4 = _mm_set1_epi8( 0x08 );
    __m128i mask4 = _mm_and_si128( in, bit4 );
    mask4 = _mm_cmpeq_epi8( mask4, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask4 );

    __m128i bit5 = _mm_set1_epi8( 0x04 );
    __m128i mask5 = _mm_and_si128( in, bit5 );
    mask5 = _mm_cmpeq_epi8( mask5, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask5 );

    __m128i bit6 = _mm_set1_epi8( 0x02 );
    __m128i mask6 = _mm_and_si128( in, bit6 );
    mask6 = _mm_cmpeq_epi8( mask6, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask6 );

    __m128i bit7 = _mm_set1_epi8( 0x01 );
    __m128i mask7 = _mm_and_si128( in, bit7 );
    mask7 = _mm_cmpeq_epi8( mask7, _mm_setzero_si128() );
    sum = _mm_add_epi8( sum, mask7 );

/* finish up */
    sum = _mm_sub_epi8( _mm_setzero_si128(), sum );

    print128_num(sum);
}
void pop_count2()
{
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("pop_count2():population count for each byte:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
}
int main(void)
{
    pop_count1();
    pop_count2();

    return 0;
}
4

2 回答 2

10

16 个 8 位值的 SSE 4 popcount 可以通过这种方式并行完成:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

//----------------------------------------------------------------------------
//
// parallelPopcnt16bytes - find population count for 8-bit groups in xmm (16 groups)
//                         each byte of xmm result contains a value ranging from 0 to 8
//
static __m128i parallelPopcnt16bytes (__m128i xmm)
   {
    const __m128i mask4 = _mm_set1_epi8 (0x0F);
    const __m128i lookup = _mm_setr_epi8 (0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
   __m128i low, high, count;

   low = _mm_and_si128 (mask4, xmm);
   high = _mm_and_si128 (mask4, _mm_srli_epi16 (xmm, 4));
   count = _mm_add_epi8 (_mm_shuffle_epi8 (lookup, low), _mm_shuffle_epi8 (lookup, high));
   return count;
   }

//----------------------------------------------------------------------------

int main (void)
    {
    int index;
    __m128i testVector = _mm_set_epi8 (1, 2, 4, 8, 16, 32, 64, 128, 0, 1, 3, 7, 15, 31, 63, 127);
    __m128i counts = parallelPopcnt16bytes (testVector);

    printf ("population count for each byte:");
    for (index = 15; index >= 0; index--)
        {
        uint8_t *bytes = (void *) &counts;
        printf (" %d", bytes [index]);
        }
    printf ("\n");
    return 0;
    }

//----------------------------------------------------------------------------
于 2013-07-08T16:05:51.180 回答
2

popcnt与 SSE4.2 ISA 扩展同时引入,但不在 SSE 向量寄存器上运行。对于每个单独的结果,您将需要单独的说明。

此外,它没有为 8 位操作数定义。如果您需要对每个单独的字节进行计数,则需要填充到 16 位。

您可以一次在 64 位寄存器中求和 8 个字节,但这听起来不像您所追求的。

参考:SSE4 手册

SSE2 解决方案。

我没有对此进行测试,但是您可以将 SSE 注册为0x80808080... 以获得全 1 或全 0 的 16 字节掩码。对一个字节中的所有 8 位重复,并对掩码求和。由于所有 1 都表示二进制补码中的 -1,因此将 16 个字节取反,您将得到所有结果。

AND 和比较操作应该能够并行运行。加法链是依赖的,但它仍然应该运行得很快,它适合 32 条指令。(只需要添加 7 个。)

/* init */
__m128i bit0 = _mm_set1_epi8( 0x80 );
__m128i mask0 = _mm_and_si128( in, bit0 );
__m128i sum = _mm_cmpeq_epi8( mask0, _mm_setzero_si128() );

/* general pattern */
__m128i bit1 = _mm_set1_epi8( 0x40 );
__m128i mask1 = _mm_and_si128( in, bit1 );
mask1 = _mm_cmpeq_epi8( mask1, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask1 );

/* next bit */
__m128i bit2 = _mm_set1_epi8( 0x20 );
__m128i mask2 = _mm_and_si128( in, bit2 );
mask2 = _mm_cmpeq_epi8( mask2, _mm_setzero_si128() );
sum = _mm_add_epi8( sum, mask2 );

...

/* finish up */
sum = _mm_sub_epi8( _mm_setzero_si128(), sum );
于 2013-07-08T03:42:21.060 回答