这是 SSE (Intel)/MMX+ (AMD) 版本。
/*
 * Sum of absolute differences (SAD) between two 8x8 blocks of bytes,
 * computed with the MMX/SSE _mm_sad_pu8 intrinsic.
 *
 * block1, block2: pointers to the first byte of the top row of each block.
 * stride:         distance in bytes from the start of one row to the start
 *                 of the next; the same stride is applied to both blocks.
 * result:         receives the total SAD over all 8 rows of 8 bytes.
 *
 * NOTE(review): each row is read by dereferencing a __m64 pointer cast from
 * the byte pointer; confirm whether callers guarantee 8-byte alignment.
 */
void block_8x8(uint8_t *block1,
               uint8_t *block2,
               int stride,
               int *result)
{
    const uint8_t *row_a = block1;
    const uint8_t *row_b = block2;
    int row;

    /* Row 0 seeds the running total; the remaining 7 rows are accumulated. */
    __m64 total = _mm_sad_pu8(*(const __m64 *)row_a, *(const __m64 *)row_b);

    for (row = 1; row < 8; ++row) {
        /* Step to the next row first, so the pointers are never moved past
         * the last row that is actually read. */
        row_a += stride;
        row_b += stride;
        total = _mm_add_pi32(total,
                             _mm_sad_pu8(*(const __m64 *)row_a,
                                         *(const __m64 *)row_b));
    }

    /* The low 32 bits of the accumulator hold the sum. */
    *result = _mm_cvtsi64_si32(total);

    /* Leave MMX state so subsequent x87 floating-point code works. */
    _mm_empty();
}