`_mm_avg_epu16` 通过 `PAVGW` 指令提供两个无符号 16 位整数的平均值。
要使用 SSE 获得两个有符号 16 位整数的平均值,转换为 `float` 再除以 `2.` 是唯一合适(最佳)的方法吗?即“有符号平均值,四舍五入,然后取反最高位”(@Mysticial),还是有其他方式?
编辑:这是我要优化的代码。到目前为止,我用 SSE 所做的所有尝试结果都很接近,但并不完全一致,问题通常出在饱和/溢出回绕上:
/* Scalar reference computation that the SSE attempts try to reproduce.
 * This is pseudo-code: the loop over `i` is not shown. */
int16_t *a;    /* first input: signed 16-bit samples */
int16_t *b;    /* second input: signed 16-bit samples */
uint16_t *out; /* output: unsigned 16-bit samples */
/* a[i] + b[i] is evaluated in int, so the sum itself cannot overflow.
 * The average is then shifted by +32768 into the unsigned range; the extra
 * 0.5 combined with the truncating int() cast rounds halves upward, since
 * the operand is always positive (minimum is -32768 + 32768.5 = 0.5). */
out[i] = int((a[i] + b[i]) / 2.0f + 32768.5f)
尝试#1:
/* Attempt #1: rounded unsigned average (PAVGW) of the raw lanes, then add
 * 32768 to every lane (equivalent to flipping each lane's top bit).
 * Here `a` and `b` are presumably __m128i vectors loaded from the input
 * arrays - TODO confirm.
 * NOTE(review): _mm_avg_epu16 treats each lane as unsigned, so lanes where
 * a and b have different signs are averaged on the wrong side of the
 * 0xFFFF -> 0x0000 wrap. Example: a = -1 (0xFFFF), b = 1 yields
 * (65535 + 1 + 1) >> 1 = 32768, and adding 32768 wraps to 0, whereas
 * int((a + b) / 2.0f + 32768.5f) gives 32768. */
const __m128i outputVal = _mm_add_epi16(_mm_avg_epu16(a, b), _mm_set1_epi16(32768));
尝试#2:
/* Attempt #2: add the lanes, halve with an arithmetic right shift, then add
 * 32768 to move the result into the unsigned range.
 * NOTE(review): _mm_add_epi16 wraps modulo 2^16, so the sum is wrong
 * whenever a + b lies outside [-32768, 32767]. */
const __m128i sum = _mm_add_epi16(a, b);
/* NOTE(review): the shift rounds toward negative infinity, while
 * int((a + b) / 2.0f + 32768.5f) rounds halves upward, so every odd sum
 * comes out one lower (e.g. a = 0, b = 1 gives 32768 here vs. 32769). */
const __m128i outputVal = _mm_add_epi16(_mm_srai_epi16(sum, 1), _mm_set1_epi16(32768));
尝试#3:
/* Attempt #3: do the averaging in single-precision float. The eight 16-bit
 * lanes are summed, split into two 64-bit halves, and each half is converted
 * to four floats, halved, offset by 32768.5, converted back to 16-bit, and
 * finally recombined into one 128-bit vector.
 * NOTE(review): this path uses MMX (__m64) intrinsics; no _mm_empty() is
 * visible in this snippet - confirm the surrounding code clears MMX state
 * before any x87 floating-point use. */
const __m128 elt_offset = _mm_set1_ps(32768.5f); /* per-lane bias + rounding offset */
const __m128 avg_divisor = _mm_set1_ps(2.f); /* per-lane divisor for the average */
/* NOTE(review): the sum is formed in 16 bits and wraps modulo 2^16 before
 * it is ever widened to float, so overflowing sums are already wrong here. */
const __m128i eltSum = _mm_add_epi16(edgeRowElts, edgeInnerRowElts); /* eltSum = lane-wise 16-bit sum of the two input vectors */
const __m64 eltSumLow = _mm_movepi64_pi64(eltSum); /* eltSumLow = low 64 bits of eltSum (lanes 0..3) */
const __m64 eltSumHigh = _mm_movepi64_pi64(_mm_srli_si128(eltSum, 8)); /* eltSumHigh = high 64 bits of eltSum (lanes 4..7); the byte shift by 8 moves them down */
/* Lower four lanes */
__m128 eltSumF = _mm_cvtpi16_ps(eltSumLow); /* eltSumF = (float) eltSum, lanes treated as signed 16-bit */
__m128 eltAvg = _mm_div_ps(eltSumF, avg_divisor); /* eltAvg = eltSum / 2.0f */
__m128 eltAvgOffset = _mm_add_ps(eltAvg, elt_offset); /* eltAvgOffset = eltAvg + 32768.5f */
/* NOTE(review): _mm_cvtps_pi16 produces *signed* 16-bit lanes with
 * saturation, so any value above 32767 becomes 0x7FFF instead of the
 * intended unsigned result. It also appears to round per the current MXCSR
 * mode rather than truncate like the scalar int() cast - confirm. */
const __m64 outputValLow = _mm_cvtps_pi16(eltAvgOffset); /* outputValLow = (short) eltAvgOffset, signed-saturated */
/* Upper four lanes: same steps as above, reusing the temporaries */
eltSumF = _mm_cvtpi16_ps(eltSumHigh); /* eltSumF = (float) eltSum, lanes treated as signed 16-bit */
eltAvg = _mm_div_ps(eltSumF, avg_divisor); /* eltAvg = eltSum / 2.0f */
eltAvgOffset = _mm_add_ps(eltAvg, elt_offset); /* eltAvgOffset = eltAvg + 32768.5f */
const __m64 outputValHigh = _mm_cvtps_pi16(eltAvgOffset); /* outputValHigh = (short) eltAvgOffset, signed-saturated (see note above) */
/* Recombine: high half is shifted up by 8 bytes, low half is OR-ed in. */
__m128i outputVal = _mm_slli_si128(_mm_movpi64_epi64(outputValHigh), 8); /* outputVal = (outputValHigh << 64) */
outputVal = _mm_or_si128(outputVal, _mm_movpi64_epi64(outputValLow)); /* outputVal = outputVal | outputValLow */