我有一个指向字节数组的指针,mixed
其中包含两个不同数组的交错字节array1
和array2
. 说mixed
看起来像这样:
a1b2c3d4...
我需要做的是对字节进行去交错处理,以便得到array1 = abcd...
and array2 = 1234...
。mixed
我提前知道了 的长度,array1
和的长度array2
是等价的,都等于mixed / 2
。
这是我当前的实现(array1
并且array2
已经分配):
int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
array1[i] = mixed[j];
array2[i] = mixed[j+1];
}
这避免了任何昂贵的乘法或除法运算,但仍然不够快。我希望有类似的东西memcpy
需要一个索引器,它可以使用低级块复制操作来加速这个过程。有没有比我目前拥有的更快的实现?
编辑
目标平台是 iOS 和 Mac 的 Objective-C。对于 iOS 设备而言,快速操作更为重要,因此专门针对 iOS 的解决方案总比没有好。
更新
感谢大家的回复,尤其是 Stephen Canon、Graham Lee 和 Mecki。这是我的“主”函数,如果可用,它使用斯蒂芬的 NEON 内在函数,否则使用格雷厄姆的联合游标,如 Mecki 所建议的那样减少迭代次数。
void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics
// iterate 32-bytes at a time
div_t dstABLength_32 = div(dstABLength, 32);
if (dstABLength_32.rem == 0)
{
while (dstABLength_32.quot --> 0)
{
const uint8x16_t a = vld1q_u8(srcA);
const uint8x16_t b = vld1q_u8(srcB);
const uint8x16x2_t ab = { a, b };
vst2q_u8(dstAB, ab);
srcA += 16;
srcB += 16;
dstAB += 32;
}
return;
}
// iterate 16-bytes at a time
div_t dstABLength_16 = div(dstABLength, 16);
if (dstABLength_16.rem == 0)
{
while (dstABLength_16.quot --> 0)
{
const uint8x8_t a = vld1_u8(srcA);
const uint8x8_t b = vld1_u8(srcB);
const uint8x8x2_t ab = { a, b };
vst2_u8(dstAB, ab);
srcA += 8;
srcB += 8;
dstAB += 16;
}
return;
}
#endif
// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration
// iterate 8-bytes at a time
div_t dstABLength_8 = div(dstABLength, 8);
if (dstABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;
uint64_t *dstAB64 = (uint64_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
cursor.narrow.a3 = srcA[j ];
cursor.narrow.b3 = srcB[j++];
cursor.narrow.a4 = srcA[j ];
cursor.narrow.b4 = srcB[j++];
dstAB64[i] = cursor.wide;
}
return;
}
// iterate 4-bytes at a time
div_t dstABLength_4 = div(dstABLength, 4);
if (dstABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;
uint32_t *dstAB32 = (uint32_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
dstAB32[i] = cursor.wide;
}
return;
}
// iterate 2-bytes at a time
div_t dstABLength_2 = div(dstABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;
uint16_t *dstAB16 = (uint16_t *)dstAB;
for (int i = 0; i < dstABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.narrow.a = srcA[i];
cursor.narrow.b = srcB[i];
dstAB16[i] = cursor.wide;
}
}
void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics
// iterate 32-bytes at a time
div_t srcABLength_32 = div(srcABLength, 32);
if (srcABLength_32.rem == 0)
{
while (srcABLength_32.quot --> 0)
{
const uint8x16x2_t ab = vld2q_u8(srcAB);
vst1q_u8(dstA, ab.val[0]);
vst1q_u8(dstB, ab.val[1]);
srcAB += 32;
dstA += 16;
dstB += 16;
}
return;
}
// iterate 16-bytes at a time
div_t srcABLength_16 = div(srcABLength, 16);
if (srcABLength_16.rem == 0)
{
while (srcABLength_16.quot --> 0)
{
const uint8x8x2_t ab = vld2_u8(srcAB);
vst1_u8(dstA, ab.val[0]);
vst1_u8(dstB, ab.val[1]);
srcAB += 16;
dstA += 8;
dstB += 8;
}
return;
}
#endif
// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration
// iterate 8-bytes at a time
div_t srcABLength_8 = div(srcABLength, 8);
if (srcABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;
uint64_t *srcAB64 = (uint64_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.wide = srcAB64[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
dstA[j ] = cursor.narrow.a3;
dstB[j++] = cursor.narrow.b3;
dstA[j ] = cursor.narrow.a4;
dstB[j++] = cursor.narrow.b4;
}
return;
}
// iterate 4-bytes at a time
div_t srcABLength_4 = div(srcABLength, 4);
if (srcABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;
uint32_t *srcAB32 = (uint32_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.wide = srcAB32[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
}
return;
}
// iterate 2-bytes at a time
div_t srcABLength_2 = div(srcABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;
uint16_t *srcAB16 = (uint16_t *)srcAB;
for (int i = 0; i < srcABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.wide = srcAB16[i];
dstA[i] = cursor.narrow.a;
dstB[i] = cursor.narrow.b;
}
}