鉴于上面评论中的讨论,我起草了一个使用 VMPSADBW 指令进行 8x8 运动估计的可运行示例。我对 GCC-4.8.1 为它生成的代码有些失望,但这是一个非常好的起点。示例包含两个测试,既用于验证功能,也用于演示新函数 sad_block_8x8_range() 的用法。
VMPSADBW 内部循环使用 8 次加载、8 次 VMPSADBW、7 次垂直加法、一次 shuffle 和一次归约加法,来计算 8x8 块与原始图像中 8 个相互重叠的块之间的 SAD。用 | 0xFFFFU 屏蔽掉无效的 SAD 之后,PHMINPOSUW 会立即给出最低的 SAD 及其索引;该指令返回 xmm 寄存器中八个无符号 16 位值里的最小值及其索引。如果这个 SAD 比当前的最佳值还低,就将它与该索引一起保存。
/* Includes */
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <immintrin.h>
/* Typedefs: short aliases for the <stdint.h> fixed-width unsigned integers. */
typedef uint8_t  u8;   /*  8-bit unsigned */
typedef uint16_t u16;  /* 16-bit unsigned */
typedef uint32_t u32;  /* 32-bit unsigned */
typedef uint64_t u64;  /* 64-bit unsigned */
/* Functions */
/**
 * Exhaustive 8x8 block-matching search built around VMPSADBW (AVX2).
 *
 * For every offset (dx, dy) with 0 <= dx < maxDx and 0 <= dy < maxDy, the SAD
 * (sum of absolute differences) between the 8x8 reference block and the 8x8
 * block of the original image located at orig + dy*os + dx is evaluated,
 * eight horizontally adjacent offsets at a time. The lowest SAD and its
 * offset are reported. Only a strictly lower SAD replaces the current best,
 * so on ties the candidate visited first (lowest dy, then lowest dx) wins.
 *
 * Memory access notes:
 *  - Rows 0..6 of each candidate block are fetched with plain 16-byte loads
 *    at column dx, which can extend up to 8 bytes past the last column that
 *    is actually needed (maxDx+6). These loads rely on the following image
 *    row being readable memory.
 *  - Row 7 of each candidate block is, when NO_OVERALLOCATION is non-zero and
 *    fewer than 9 offsets remain in the row, fetched with a 16-byte load that
 *    ends exactly at column maxDx+6 and therefore starts at column maxDx-9,
 *    which lies before `orig`'s column 0 when maxDx < 9.
 *
 * If maxDx or maxDy is 0 no candidate is examined and the outputs are
 * SAD = 0xFFFF, dx = 0, dy = 0.
 *
 * @param [in]  orig    A pointer into the image within which to run ME. Points
 *                      to a base offset from which a window of maxDx pixels to
 *                      the right and maxDy pixels down is explored to find the
 *                      lowest SAD.
 * @param [in]  os      The span (bytes per row) of the original image.
 * @param [in]  ref     A pointer to the 8x8 reference block being SAD-ed for in
 *                      the original image.
 * @param [in]  rs      The span (bytes per row) of the 8x8 reference block.
 * @param [in]  maxDx   The width of the search window for ME. Cannot be 0.
 * @param [in]  maxDy   The height of the search window for ME. Cannot be 0.
 * @param [out] sadOut  The lowest SAD found. May be NULL.
 * @param [out] dxOut   The corresponding best vector found, x-coordinate.
 *                      May be NULL.
 * @param [out] dyOut   The corresponding best vector found, y-coordinate.
 *                      May be NULL.
 */

/**
 * Define to 0 if the image can be guaranteed to always have 8 extra allocated
 * bytes beyond its nominal end.
 */
#ifndef NO_OVERALLOCATION
#define NO_OVERALLOCATION 1
#endif

void sad_block_8x8_range(const u8* orig,
                         unsigned  os,
                         const u8* ref,
                         unsigned  rs,
                         unsigned  maxDx,
                         unsigned  maxDy,
                         unsigned* sadOut,
                         unsigned* dxOut,
                         unsigned* dyOut){
    /**
     * Row k of this table (8 words per row) has its first k words clear and
     * the rest set. OR-ing row k into the eight SADs forces lanes k..7 to
     * 0xFFFF so that PHMINPOSUW can never pick an offset outside the window.
     * Row 0 is never used (at least one offset is always valid); it is kept
     * so the table can be indexed directly by the number of valid offsets.
     */
    static const u16 MASKTBLw[8*8] = {
        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF,
        0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF,
    };
    /**
     * VMPSADBW immediates. Bits [1:0] / [4:3] pick the 4-byte group of the
     * reference register used by the low / high 128-bit lane, bit 2 / bit 5
     * pick whether that lane starts at byte 0 or byte 4 of the image row.
     * Each reference register holds two rows, hence the two variants.
     */
    enum {
        IMM_FIRST_ROW  = 1<<5 | 1<<3 | 0<<2 | 0<<0, /* low: bytes 0-3,  high: bytes 4-7   */
        IMM_SECOND_ROW = 1<<5 | 3<<3 | 0<<2 | 2<<0  /* low: bytes 8-11, high: bytes 12-15 */
    };

    const size_t refStride = rs;
    unsigned     minDx = 0, minDy = 0;
    unsigned     minSAD = 0xFFFF; /* Above any real 8x8 SAD (at most 64*255). */
    unsigned     dx, dy;
    __m128i      ref01, ref23, ref45, ref67;
    __m256i      r01, r23, r45, r67;

    /**
     * Load the eight 8-byte rows of the reference block, two rows per xmm
     * register (even row in the low half, odd row in the high half).
     */
    ref01 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(ref + 0*refStride)),
                               _mm_loadl_epi64((const __m128i*)(ref + 1*refStride)));
    ref23 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(ref + 2*refStride)),
                               _mm_loadl_epi64((const __m128i*)(ref + 3*refStride)));
    ref45 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(ref + 4*refStride)),
                               _mm_loadl_epi64((const __m128i*)(ref + 5*refStride)));
    ref67 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(ref + 6*refStride)),
                               _mm_loadl_epi64((const __m128i*)(ref + 7*refStride)));

    /* Duplicate each row pair into both 128-bit lanes of a ymm register. */
    r01 = _mm256_broadcastsi128_si256(ref01);
    r23 = _mm256_broadcastsi128_si256(ref23);
    r45 = _mm256_broadcastsi128_si256(ref45);
    r67 = _mm256_broadcastsi128_si256(ref67);

    /* Iterate over x and y of search space, eight x offsets per step. */
    for(dy=0;dy<maxDy;dy++){
        for(dx=0;dx<maxDx;dx+=8){
            /* Number of offsets left in this row; only min(valid, 8) are used. */
            const unsigned valid = maxDx - dx;
            const u8*      row   = orig + (size_t)dy*os + dx;
            __m256i        o0, o1, o2, o3, o4, o5, o6, o7;

            /* Broadcast 16-byte rows to both halves of ymm register */
            o0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;
            o6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row)); row += os;

            if(NO_OVERALLOCATION && valid < 9){
                /**
                 * Last row of the block near the right edge of the window:
                 * only valid+7 (< 16) bytes starting at `row` are needed.
                 * Load the 16 bytes that END at row+valid+7 instead, i.e.
                 * start `back` bytes early, then shift them down again with
                 * PSHUFB so that lane i holds row[i]. Lanes >= valid+7 receive
                 * wrapped-around bytes; they only feed SAD lanes that are
                 * masked out below (or feed nothing when valid == 8).
                 */
                const unsigned back  = 9 - valid;            /* 1..8 */
                const __m128i  tail  = _mm_loadu_si128((const __m128i*)(row - back));
                const __m128i  shift = _mm_add_epi8(
                    _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
                    _mm_set1_epi8((char)back));
                o7 = _mm256_broadcastsi128_si256(_mm_shuffle_epi8(tail, shift));
            }else{
                o7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)row));
            }

            /**
             * ACTUAL ACTION.
             *
             * The upper 128-bit lane calculates the SAD for the right 4 bytes
             * of each row of the reference block, while the lower 128-bit lane
             * does similarly for the left 4 bytes of each row of the reference
             * block, each against eight consecutive neighbours.
             */
            __m256i s0 = _mm256_mpsadbw_epu8(o0, r01, IMM_FIRST_ROW);
            __m256i s1 = _mm256_mpsadbw_epu8(o1, r01, IMM_SECOND_ROW);
            __m256i s2 = _mm256_mpsadbw_epu8(o2, r23, IMM_FIRST_ROW);
            __m256i s3 = _mm256_mpsadbw_epu8(o3, r23, IMM_SECOND_ROW);
            __m256i s4 = _mm256_mpsadbw_epu8(o4, r45, IMM_FIRST_ROW);
            __m256i s5 = _mm256_mpsadbw_epu8(o5, r45, IMM_SECOND_ROW);
            __m256i s6 = _mm256_mpsadbw_epu8(o6, r67, IMM_FIRST_ROW);
            __m256i s7 = _mm256_mpsadbw_epu8(o7, r67, IMM_SECOND_ROW);

            /* Accumulate half-row results together into half-block results */
            s0 = _mm256_add_epi16(s0, s1);
            s0 = _mm256_add_epi16(s0, s2);
            s0 = _mm256_add_epi16(s0, s3);
            s0 = _mm256_add_epi16(s0, s4);
            s0 = _mm256_add_epi16(s0, s5);
            s0 = _mm256_add_epi16(s0, s6);
            s0 = _mm256_add_epi16(s0, s7);

            /* Accumulate half-block results into block results */
            const __m128i sads = _mm_add_epi16(_mm256_extracti128_si256(s0, 0),
                                               _mm256_extracti128_si256(s0, 1));

            /**
             * Force the SADs of offsets outside the window to 0xFFFF, then
             * find the horizontal minimum and its lane using PHMINPOSUW.
             * The table is read with an unaligned load because a u16 array
             * carries no 16-byte alignment guarantee.
             */
            const __m128i invalid = valid < 8
                ? _mm_loadu_si128((const __m128i*)(MASKTBLw + 8*valid))
                : _mm_setzero_si128();
            const __m128i  best    = _mm_minpos_epu16(_mm_or_si128(sads, invalid));
            const unsigned sad     = (u16)_mm_extract_epi16(best, 0);
            const unsigned bestDx  = dx + (u16)_mm_extract_epi16(best, 1);

            /* Save the result if it is the best so far. */
            if(sad < minSAD){
                minDx  = bestDx;
                minDy  = dy;
                minSAD = sad;
            }
        }
    }

    /* Every output is optional. */
    if(sadOut){
        *sadOut = minSAD;
    }
    if(dxOut){
        *dxOut = minDx;
    }
    if(dyOut){
        *dyOut = minDy;
    }
}
/**
 * Runs one motion search over a 16x16 image (span 16) with an 8x8 reference
 * block (span 8) and a 9x9 search window, compares the result against the
 * expected SAD and vector, and prints a PASSED/FAILED line.
 *
 * @param [in] id      Test number used in the printed message.
 * @param [in] img     The 16x16 image to search.
 * @param [in] ref     The 8x8 reference block.
 * @param [in] expSad  Expected lowest SAD.
 * @param [in] expDx   Expected x-coordinate of the best vector.
 * @param [in] expDy   Expected y-coordinate of the best vector.
 * @return 1 if the search result matched the expectation, 0 otherwise.
 */
static int run_testcase(int       id,
                        const u8* img,
                        const u8* ref,
                        unsigned  expSad,
                        unsigned  expDx,
                        unsigned  expDy){
    unsigned sad, dx, dy;

    sad_block_8x8_range(img, 16, ref, 8, 9, 9, &sad, &dx, &dy);
    if(sad == expSad && dx == expDx && dy == expDy){
        printf("Test %d PASSED!\n", id);
        return 1;
    }
    printf("Test %d FAILED! (SAD = %u, MV=(%u, %u))\n", id, sad, dx, dy);
    return 0;
}

/**
 * MAIN.
 *
 * Runs two testcases.
 *
 * @return 0 if both testcases passed, 1 if at least one failed.
 */
int main(void){
    /* Reference block: the bytes 0x00..0x3f in row-major order. */
    static const u8 ref[] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    };
    /* Exact copy of the reference block at column 7, row 3. */
    static const u8 img0[] = {
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
    };
    /**
     * Copy of the reference block at column 8, row 4, except that its final
     * byte is 0x40 instead of 0x3f, giving a SAD of 1.
     */
    static const u8 img1[] = {
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x40,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
    };

    /* Run both cases unconditionally so that each prints its own verdict. */
    const int ok1 = run_testcase(1, img0, ref, 0, 7, 3);
    const int ok2 = run_testcase(2, img1, ref, 1, 8, 4);

    /* Report failure through the exit status so scripts can detect it. */
    return (ok1 && ok2) ? 0 : 1;
}