c - C语言中最快的去交错操作？

Question

我有一个指向字节数组的指针，mixed其中包含两个不同数组的交错字节array1和array2. 说mixed看起来像这样：

a1b2c3d4...

我需要做的是对字节进行去交错处理，以便得到array1 = abcd...and array2 = 1234...。mixed我提前知道了的长度，array1和的长度array2是等价的，都等于mixed / 2。

这是我当前的实现（array1并且array2已经分配）：

int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
    array1[i] = mixed[j];
    array2[i] = mixed[j+1];
}

这避免了任何昂贵的乘法或除法运算，但仍然不够快。我希望有类似的东西memcpy需要一个索引器，它可以使用低级块复制操作来加速这个过程。有没有比我目前拥有的更快的实现？

编辑

目标平台是 iOS 和 Mac 的 Objective-C。对于 iOS 设备而言，快速操作更为重要，因此专门针对 iOS 的解决方案总比没有好。

更新

感谢大家的回复，尤其是 Stephen Canon、Graham Lee 和 Mecki。这是我的“主”函数，如果可用，它使用斯蒂芬的 NEON 内在函数，否则使用格雷厄姆的联合游标，如 Mecki 所建议的那样减少迭代次数。

void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t dstABLength_32 = div(dstABLength, 32);
    if (dstABLength_32.rem == 0)
    {
        while (dstABLength_32.quot --> 0)
        {
            const uint8x16_t a = vld1q_u8(srcA);
            const uint8x16_t b = vld1q_u8(srcB);
            const uint8x16x2_t ab = { a, b };
            vst2q_u8(dstAB, ab);
            srcA += 16;
            srcB += 16;
            dstAB += 32;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t dstABLength_16 = div(dstABLength, 16);
    if (dstABLength_16.rem == 0)
    {
        while (dstABLength_16.quot --> 0)
        {
            const uint8x8_t a = vld1_u8(srcA);
            const uint8x8_t b = vld1_u8(srcB);
            const uint8x8x2_t ab = { a, b };
            vst2_u8(dstAB, ab);
            srcA += 8;
            srcB += 8;
            dstAB += 16;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t dstABLength_8 = div(dstABLength, 8);
    if (dstABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *dstAB64 = (uint64_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            cursor.narrow.a3 = srcA[j  ];
            cursor.narrow.b3 = srcB[j++];
            cursor.narrow.a4 = srcA[j  ];
            cursor.narrow.b4 = srcB[j++];
            dstAB64[i] = cursor.wide;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t dstABLength_4 = div(dstABLength, 4);
    if (dstABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *dstAB32 = (uint32_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            dstAB32[i] = cursor.wide;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t dstABLength_2 = div(dstABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *dstAB16 = (uint16_t *)dstAB;
    for (int i = 0; i < dstABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.narrow.a = srcA[i];
        cursor.narrow.b = srcB[i];
        dstAB16[i] = cursor.wide;
    }
}

void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t srcABLength_32 = div(srcABLength, 32);
    if (srcABLength_32.rem == 0)
    {
        while (srcABLength_32.quot --> 0)
        {
            const uint8x16x2_t ab = vld2q_u8(srcAB);
            vst1q_u8(dstA, ab.val[0]);
            vst1q_u8(dstB, ab.val[1]);
            srcAB += 32;
            dstA += 16;
            dstB += 16;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t srcABLength_16 = div(srcABLength, 16);
    if (srcABLength_16.rem == 0)
    {
        while (srcABLength_16.quot --> 0)
        {
            const uint8x8x2_t ab = vld2_u8(srcAB);
            vst1_u8(dstA, ab.val[0]);
            vst1_u8(dstB, ab.val[1]);
            srcAB += 16;
            dstA += 8;
            dstB += 8;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t srcABLength_8 = div(srcABLength, 8);
    if (srcABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *srcAB64 = (uint64_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.wide = srcAB64[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
            dstA[j  ] = cursor.narrow.a3;
            dstB[j++] = cursor.narrow.b3;
            dstA[j  ] = cursor.narrow.a4;
            dstB[j++] = cursor.narrow.b4;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t srcABLength_4 = div(srcABLength, 4);
    if (srcABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *srcAB32 = (uint32_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.wide = srcAB32[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t srcABLength_2 = div(srcABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *srcAB16 = (uint16_t *)srcAB;
    for (int i = 0; i < srcABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.wide = srcAB16[i];
        dstA[i] = cursor.narrow.a;
        dstB[i] = cursor.narrow.b;
    }
}

score 10 · Accepted Answer

在我的脑海中，我不知道用于解交织 2 通道字节数据的库函数。但是，值得向 Apple 提交错误报告以请求此类功能。

同时，使用 NEON 或 SSE 内在函数对此类函数进行矢量化非常容易。具体来说，在 ARM 上，您将希望使用vld1q_u8从每个源数组加载一个向量，vuzpq_u8对它们进行去交错，并vst1q_u8存储结果向量；这是我没有测试甚至尝试构建的粗略草图，但它应该说明总体思路。更复杂的实现肯定是可能的（特别是，NEON 可以在单个指令中加载/存储两个16B 寄存器，编译器可能不会这样做，并且根据缓冲区的长度，一些流水线和/或展开可能是有益的是）：

#if defined __ARM_NEON__
#   include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) {
#if defined __ARM_NEON__
    size_t vectors = mixedLength / 32;
    mixedLength %= 32;
    while (vectors --> 0) {
        const uint8x16_t src0 = vld1q_u8(mixed);
        const uint8x16_t src1 = vld1q_u8(mixed + 16);
        const uint8x16x2_t dst = vuzpq_u8(src0, src1);
        vst1q_u8(array1, dst.val[0]);
        vst1q_u8(array2, dst.val[1]);
        mixed += 32;
        array1 += 16;
        array2 += 16;
    }
#endif
    for (size_t i=0; i<mixedLength/2; ++i) {
        array1[i] = mixed[2*i];
        array2[i] = mixed[2*i + 1];
    }
}

score 3 · Accepted Answer

我只对此进行了轻微测试，但它似乎至少是您的版本的两倍：

typedef union {
uint16_t wide;
struct { uint8_t top; uint8_t bottom; } narrow;
} my_union;

uint16_t *source = (uint16_t *)mixed;
for (int i = 0; i < mixedLength/2; i++)
{
    my_union cursor;
    cursor.wide = source[i];
    array1[i] = cursor.narrow.top;
    array2[i] = cursor.narrow.bottom;
}

请注意，我对结构打包并不小心，但在这种情况下，在这种架构上这不是问题。另请注意，有人可能会抱怨我选择的命名top和bottom；我假设你知道你需要哪个整数的哪一半。

score 2 · Accepted Answer

好的，这是您的原始方法：

static void simpleDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i, j;
    int mixedLength_2 = mixedLength / 2;
    for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
    {
        array1[i] = mixed[j];
        array2[i] = mixed[j+1];
    }
}

拥有 1000 万个条目并且-O3（编译器应针对最大速度进行优化），我可以在我的 Mac 上每秒运行 154 次。

这是我的第一个建议：

static void structDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i;
    int len;
    uint8_t * array1Ptr = (uint8_t *)array1;
    uint8_t * array2Ptr = (uint8_t *)array2;
    struct {
        uint8_t byte1;
        uint8_t byte2;
    } * tb = (void *)mixed;

    len = mixedLength / 2;
    for (i = 0; i < len; i++) {
      *(array1Ptr++) = tb->byte1;
      *(array2Ptr++) = tb->byte2;
      tb++;
    }
}

与以前相同的计数和优化，我每秒运行 193 次。

现在来自 Graham Lee 的建议：

static void unionDeint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    union my_union {
        uint16_t wide;
        struct { uint8_t top; uint8_t bottom; } narrow;
    };

    uint16_t * source = (uint16_t *)mixed;
    for (int i = 0; i < mixedLength/2; i++) {
        union my_union cursor;
        cursor.wide = source[i];
        array1[i] = cursor.narrow.top;
        array2[i] = cursor.narrow.bottom;
    }
}

与以前相同的设置，每秒运行 198 次（注意：此方法不安全，结果取决于 CPU 字节序。在您的情况下，array1 和 array2 可能已交换，因为 ARM 是小字节序，因此您必须在代码中交换它们）。

这是我迄今为止最好的一个：

static void uint32Deint (
    uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
    int i;
    int count;
    uint32_t * fourBytes = (void *)mixed;
    uint8_t * array1Ptr = (uint8_t *)array1;
    uint8_t * array2Ptr = (uint8_t *)array2;


    count = mixedLength / 4;
    for (i = 0; i < count; i++) {
        uint32_t temp = *(fourBytes++);

#if __LITTLE_ENDIAN__
        *(array1Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array2Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array1Ptr++) = (uint8_t)(temp & 0xFF);
        temp >>= 8;
        *(array2Ptr++) = tb->byte2;

#else
        *(array1Ptr++) = (uint8_t)(temp >> 24);
        *(array2Ptr++) = (uint8_t)((temp >> 16) & 0xFF);
        *(array1Ptr++) = (uint8_t)((temp >>  8) & 0xFF);
        *(array2Ptr++) = (uint8_t)(temp & 0xFF);
#endif
    }
    // Either it is a multiple of 4 or a multiple of 2.
    // If it is a multiple of 2, 2 bytes are left over.
    if (count * 4 != mixedLength) {
        *(array1Ptr) = mixed[mixedLength - 2];
        *(array2Ptr) = mixed[mixedLength - 1];
    }
}

与上面相同的设置，每秒 219 次，除非我犯了错误，否则应该可以使用任何一种字节序。

score 1 · Accepted Answer

我推荐 Graham 的解决方案，但如果这对速度非常关键并且您愿意使用 Assembler，那么您可以变得更快。

这个想法是这样的：

从中读取整个 32 位整数mixed。你会得到'a1b2'。
将低 16 位旋转 8 位得到 '1ab2'（我们使用小端序，因为这是 ARM 和 Apple A# 中的默认值，所以前两个字节是低字节）。
将整个 32 位寄存器向右旋转 8 位（我认为它是正确的......）以获得“21ab”。
将低 16 位旋转 8 位得到 '12ab'
将低 8 位写入array2.
将整个 32 位寄存器循环 16 位。
将低 8 位写入array1
前进array116 位、array216 位和mixed32 位。
重复。

我们已经交换了 2 次内存读取（假设我们使用 Graham 版本或等效版本）和 4 次内存，其中一次内存读取、两次内存写入和 4 次寄存器操作。虽然操作的数量从 6 个增加到 7 个，但寄存器操作比内存操作快，所以这样更有效。此外，由于我们一次读取mixed32 位而不是 16 位，因此我们将迭代管理减少了一半。

PS：理论上这也可以用于 64 位架构，但是为 'a1b2c3d4' 做所有这些旋转会让你发疯。

score 1 · Accepted Answer

对于 x86 SSE，pack和punpck说明是您所需要的。使用 AVX 以方便非破坏性 3 操作数指令的示例。（不使用 AVX2 256b-wide 指令，因为 256b pack/unpck 指令在低和高 128b 通道中执行两个 128b 解包，因此您需要洗牌才能以正确的最终顺序得到东西。）

以下的内在函数版本将起作用。Asm 指令的输入时间较短，仅用于编写快速答案。

交错:abcd和1234-> a1b2c3d4:

# loop body:
vmovdqu    (%rax), %xmm0  # load the sources
vmovdqu    (%rbx), %xmm1
vpunpcklbw %xmm0, %xmm1, %xmm2  # low  halves -> 128b reg
vpunpckhbw %xmm0, %xmm2, %xmm3  # high halves -> 128b reg
vmovdqu    %xmm2, (%rdi)   # store the results
vmovdqu    %xmm3, 16(%rdi)
# blah blah some loop structure.

`punpcklbw` interleaves the bytes in the low 64 of the two source `xmm` registers.  There are `..wd` (word->dword), and dword->qword versions which would be useful for 16 or 32bit elements.

去交错：a1b2c3d4->abcd和1234

#outside the loop
vpcmpeqb    %xmm5, %xmm5   # set to all-1s
vpsrlw     $8, %xmm5, %xmm5   # every 16b word has low 8b = 0xFF, high 8b = 0.

# loop body
vmovdqu    (%rsi), %xmm2     # load two src chunks
vmovdqu    16(%rsi), %xmm3
vpand      %xmm2, %xmm5, %xmm0  # mask to leave only the odd bytes
vpand      %xmm3, %xmm5, %xmm1
vpackuswb  %xmm0, %xmm1, %xmm4
vmovdqu    %xmm4, (%rax)    # store 16B of a[]
vpsrlw     $8, %xmm2, %xmm6     # even bytes -> odd bytes
vpsrlw     $8, %xmm3, %xmm7
vpackuswb  %xmm6, %xmm7, %xmm4
vmovdqu    %xmm4, (%rbx)

这当然可以使用更少的寄存器。我避免重用寄存器以提高可读性，而不是性能。只要您从不依赖于先前值的东西开始，硬件寄存器重命名就不会导致重用。（例如movd，不是movss或pinsrd。）

解交织的工作量要大得多，因为pack指令会进行有符号或无符号饱和，因此每个 16b 元素的高 8b 必须首先归零。

另一种方法是使用pshufb将单个源 reg 的奇数或偶数字打包到寄存器的低 64 位。但是，在 AMD XOP 指令集之外VPPERM，没有可以同时从 2 个寄存器中选择字节的 shuffle（就像 Altivec 的备受喜爱的vperm）。因此，仅使用 SSE/AVX，每 128b 的交错数据需要 2 次随机播放。并且由于存储端口的使用可能是瓶颈，apunpck将两个 64 位块组合a到一个寄存器中以建立一个 128b 存储。

使用 AMD XOP，去交错将是 2x128b 加载、2VPPERM和 2x128b 存储。

score -1 · Accepted Answer

过早优化不好
您的编译器可能比您更擅长优化。

也就是说，您可以做一些事情来帮助编译器，因为您拥有编译器无法拥有的数据语义知识：

尽可能多地读取和写入字节，直到本机字大小 - 内存操作很昂贵，因此尽可能在寄存器中进行操作
展开循环 - 查看“Duff 的设备”。

FWIW，我制作了您的复制循环的两个版本，一个与您的大致相同，第二个使用大多数人认为“最佳”（尽管仍然很简单）的 C 代码：

void test1(byte *p, byte *p1, byte *p2, int n)
{
    int i, j;
    for (i = 0, j = 0; i < n / 2; i++, j += 2) {
        p1[i] = p[j];
        p2[i] = p[j + 1];
    }
}

void test2(byte *p, byte *p1, byte *p2, int n)
{
    while (n) {
        *p1++ = *p++;
        *p2++ = *p++;
        n--; n--;
    }
}

在gcc -O3 -SIntel x86 上，它们都产生了几乎相同的汇编代码。这是内部循环：

LBB1_2:
    movb    -1(%rdi), %al
    movb    %al, (%rsi)
    movb    (%rdi), %al
    movb    %al, (%rdx)
    incq    %rsi
    addq    $2, %rdi
    incq    %rdx
    decq    %rcx
    jne LBB1_2

和

LBB2_2:
    movb    -1(%rdi), %al
    movb    %al, (%rsi)
    movb    (%rdi), %al
    movb    %al, (%rdx)
    incq    %rsi
    addq    $2, %rdi
    incq    %rdx
    addl    $-2, %ecx
    jne LBB2_2

两者具有相同数量的指令，差异仅是因为第一个版本计数为n / 2，而第二个版本计数为零。

编辑这里是一个更好的版本：

/* non-portable - assumes little endian */
void test3(byte *p, byte *p1, byte *p2, int n)
{
    ushort *ps = (ushort *)p;

    n /= 2;
    while (n) {
        ushort n = *ps++;
        *p1++ = n;
        *p2++ = n >> 8;
    }
}

导致：

LBB3_2:
    movzwl  (%rdi), %ecx
    movb    %cl, (%rsi)
    movb    %ch, (%rdx)  # NOREX
    addq    $2, %rdi
    incq    %rsi
    incq    %rdx
    decq    %rax
    jne LBB3_2

这是少了一条指令，因为它利用了对%cland的立即访问%ch。

c - C语言中最快的去交错操作？

6 回答 6

Related

Reference