我正在尝试对 d0、d1、d2 + d3、d4、d5+ d6、d7、d8 求和。我不知道最好的指令,然后取平均值为 9。我知道如何使用近似值进行平均,但是将这些车道求和,我找不到相应的指令?我也有不正确的输出图像,所以我怀疑平均操作是否正确。
inline void downsample3dOnePass( uint8_t* src, uint8_t *dst, int srcWidth)
{
for (int r = 0; r < (int)srcWidth/3; r++)
{
// load 24 pixels (grayscale)
uint8x8x3_t r0 = vld3_u8(src);
// move to next 24 byes
src+=24;
uint8x8x3_t r1 = vld3_u8(src);
src+=24;
uint8x8x3_t r2 = vld3_u8(src);
uint16x8_t d0 = vmovl_u8(r0.val[0]);
uint16x8_t d1 = vmovl_u8(r0.val[1]);
uint16x8_t d2 = vmovl_u8(r0.val[2]);
uint16x8_t d3 = vmovl_u8(r1.val[0]);
uint16x8_t d4 = vmovl_u8(r1.val[1]);
uint16x8_t d5 = vmovl_u8(r1.val[2]);
uint16x8_t d6 = vmovl_u8(r2.val[0]);
uint16x8_t d7 = vmovl_u8(r2.val[1]);
uint16x8_t d8 = vmovl_u8(r2.val[2]);
uint16x8_t d0d3Sum = vaddq_u16 ( d0, d3);
uint16x8_t d0d3d6Sum = vaddq_u16 ( d0d3Sum, d6 );
uint16x8_t d1d4Sum = vaddq_u16 ( d1, d4);
uint16x8_t d1d4d7Sum = vaddq_u16 ( d1d4Sum, d7);
uint16x8_t d2d5Sum = vaddq_u16 ( d2, d5 );
uint16x8_t d2d5d8Sum = vaddq_u16 ( d2d5Sum, d8);
uint16x8_t firstSum = vaddq_u16(d0d3d6Sum, d1d4d7Sum);
uint16x8_t secondSum = vaddq_u16(firstSum, d2d5d8Sum);
uint16x8_t totalSum = vaddq_u16 ( firstSum, secondSum);
// average = r0+r1+r2/8 ~9 for test
uint16x8_t totalAverage = vshrq_n_u16(totalSum,3);
uint8x8_t finalValue = vmovn_u16(totalAverage);
// store 8 bytes
vst1_u8(dst, finalValue);
src+=24;
// move to next row
dst+=8;
}
}
void downsample3d( uint8_t* src, uint8_t *dest, int srcWidth, int srcHeight )
{
for (int r = 0; r < (int)srcHeight/3; r++)
{
downsample3dOnePass(src, dest, srcWidth);
}
}
更新:根据bitbank的回答:
inline void downsample3dOnePass( uint8_t* src, uint8_t *dst, int srcWidth, int srcHeight, int strideSrc, int strideDest)
{
int iDestPitch = (strideDest);
uint8_t *s, *d;
uint8x8x3_t u88line0;
uint8x8x3_t u88line1;
uint8x8x3_t u88line2;
uint8x8_t u88Final;
uint16x8_t u168Sum;
int16x8_t i168divisor = vdupq_n_s16(7282/2); // 65536/9 - used with doubling saturating return high multiply
for (int r = 0; r < srcHeight/3; r++)
{
d = &dst[iDestPitch * r];
s = &src[srcWidth * r*3];
for (int c = 0; c < srcWidth/3; c+=8)
{
// load 8 sets of 3x3 pixels (grayscale)
u88line0 = vld3_u8(&s[0]);
u88line1 = vld3_u8(&s[srcWidth]);
u88line2 = vld3_u8(&s[srcWidth*2]);
s += 24;
// Sum vertically
u168Sum = vaddl_u8(u88line0.val[0], u88line0.val[1]); // add with widening
u168Sum = vaddw_u8(u168Sum, u88line0.val[2]); // accumulate with widening (horizontally)
u168Sum = vaddw_u8(u168Sum, u88line1.val[0]); // add the other vectors together
u168Sum = vaddw_u8(u168Sum, u88line1.val[1]);
u168Sum = vaddw_u8(u168Sum, u88line1.val[2]);
u168Sum = vaddw_u8(u168Sum, u88line2.val[0]);
u168Sum = vaddw_u8(u168Sum, u88line2.val[1]);
u168Sum = vaddw_u8(u168Sum, u88line2.val[2]);
// we now have the 8 sets of 3x3 pixels summed to 8 16-bit values
// To divide by 9 we will instead multiply by the inverse (65536/9) = 7282
u168Sum = vreinterpretq_u16_s16(vqrdmulhq_s16(i168divisor, vreinterpretq_s16_u16(u168Sum)));
u88Final = vmovn_u16(u168Sum); // narrow to 8 bits
// store 8 bytes
vst1_u8(d, u88Final);
d += 8;
} // for column
} // for row
}
usage:
//1280*920*grayscale
QImage normalImage("/data/normal_image.png");
uint8_t *resultImage = new uint8_t[440*306];
downsample3dOnePass(normalImage.bits(),resultImage, normalImage.width(), normalImage.height(), 1280, 440);