0

我正在尝试对 d0、d1、d2 + d3、d4、d5+ d6、d7、d8 求和。我不知道最好的指令,然后取平均值为 9。我知道如何使用近似值进行平均,但是将这些车道求和,我找不到相应的指令?我也有不正确的输出图像,所以我怀疑平均操作是否正确。

inline void downsample3dOnePass( uint8_t* src, uint8_t *dst, int srcWidth)
{

    for (int r = 0; r < (int)srcWidth/3; r++)
    {
       // load 24 pixels (grayscale)
       uint8x8x3_t r0       = vld3_u8(src);
       // move to next 24 byes
       src+=24;
       uint8x8x3_t r1       = vld3_u8(src);
       src+=24;
       uint8x8x3_t r2       = vld3_u8(src);

       uint16x8_t  d0  = vmovl_u8(r0.val[0]);
       uint16x8_t  d1  = vmovl_u8(r0.val[1]);
       uint16x8_t  d2  = vmovl_u8(r0.val[2]);

       uint16x8_t  d3  = vmovl_u8(r1.val[0]);
       uint16x8_t  d4  = vmovl_u8(r1.val[1]);
       uint16x8_t  d5  = vmovl_u8(r1.val[2]);

       uint16x8_t  d6  = vmovl_u8(r2.val[0]);
       uint16x8_t  d7  = vmovl_u8(r2.val[1]);
       uint16x8_t  d8  = vmovl_u8(r2.val[2]);

       uint16x8_t d0d3Sum      = vaddq_u16 ( d0, d3);
       uint16x8_t d0d3d6Sum    = vaddq_u16 ( d0d3Sum,  d6 );

       uint16x8_t d1d4Sum      = vaddq_u16 ( d1, d4);
       uint16x8_t d1d4d7Sum    = vaddq_u16 ( d1d4Sum, d7);

       uint16x8_t d2d5Sum      = vaddq_u16 ( d2, d5 );
       uint16x8_t d2d5d8Sum    = vaddq_u16 ( d2d5Sum, d8);

       uint16x8_t firstSum     = vaddq_u16(d0d3d6Sum, d1d4d7Sum);
       uint16x8_t secondSum    = vaddq_u16(firstSum, d2d5d8Sum);
       uint16x8_t totalSum     = vaddq_u16 ( firstSum, secondSum);

       // average = r0+r1+r2/8 ~9 for test
       uint16x8_t totalAverage = vshrq_n_u16(totalSum,3);
       uint8x8_t  finalValue   = vmovn_u16(totalAverage);
       // store 8 bytes
       vst1_u8(dst, finalValue);

       src+=24;
       // move to next row
       dst+=8;

   }

}

void downsample3d( uint8_t* src, uint8_t *dest, int srcWidth, int srcHeight )
{
    for (int r = 0; r < (int)srcHeight/3; r++)
    {
         downsample3dOnePass(src, dest, srcWidth);
    }
}

更新:根据bitbank的回答:

    inline void downsample3dOnePass( uint8_t* src, uint8_t *dst, int srcWidth, int srcHeight, int strideSrc, int strideDest)
    {
        int iDestPitch = (strideDest);
        uint8_t *s, *d;
        uint8x8x3_t u88line0;
        uint8x8x3_t u88line1;
        uint8x8x3_t u88line2;
        uint8x8_t   u88Final;
        uint16x8_t  u168Sum;
        int16x8_t   i168divisor = vdupq_n_s16(7282/2); // 65536/9 - used with doubling saturating return high multiply

        for (int r = 0; r < srcHeight/3; r++)
        {
            d = &dst[iDestPitch * r];
            s = &src[srcWidth * r*3];

            for (int c = 0; c < srcWidth/3; c+=8)
            {
                // load 8 sets of 3x3 pixels (grayscale)
                u88line0 = vld3_u8(&s[0]);
                u88line1 = vld3_u8(&s[srcWidth]);
                u88line2 = vld3_u8(&s[srcWidth*2]);
                s += 24;
                // Sum vertically
                u168Sum = vaddl_u8(u88line0.val[0], u88line0.val[1]); // add with widening
                u168Sum = vaddw_u8(u168Sum, u88line0.val[2]); // accumulate with widening (horizontally)
                u168Sum = vaddw_u8(u168Sum, u88line1.val[0]); // add the other vectors together
                u168Sum = vaddw_u8(u168Sum, u88line1.val[1]);
                u168Sum = vaddw_u8(u168Sum, u88line1.val[2]);
                u168Sum = vaddw_u8(u168Sum, u88line2.val[0]);
                u168Sum = vaddw_u8(u168Sum, u88line2.val[1]);
                u168Sum = vaddw_u8(u168Sum, u88line2.val[2]);
                // we now have the 8 sets of 3x3 pixels summed to 8 16-bit values
                // To divide by 9 we will instead multiply by the inverse (65536/9) = 7282
                u168Sum = vreinterpretq_u16_s16(vqrdmulhq_s16(i168divisor, vreinterpretq_s16_u16(u168Sum)));
                u88Final = vmovn_u16(u168Sum); // narrow to 8 bits
                // store 8 bytes
                vst1_u8(d, u88Final);
                d += 8;
            } // for column
        } // for row
    }


usage: 
//1280*920*grayscale
QImage normalImage("/data/normal_image.png");

uint8_t *resultImage = new uint8_t[440*306];
  downsample3dOnePass(normalImage.bits(),resultImage, normalImage.width(), normalImage.height(), 1280, 440);
4

2 回答 2

3

您的代码有几个问题。NEON 内在函数在 VLDx 处理方面非常糟糕,但您的大错误是您溢出了字节值并水平而不是垂直加载像素。这是一个更好的算法,它将一次将 8*3x3 源像素处理成 8 个目标像素。您的函数也缺少 rows 参数。

inline void downsample3dOnePass( uint8_t* src, uint8_t *dst, int srcWidth, int srcHeight)
{
int iDestPitch = ((srcWidth/3)+3) & 0xfffffffc; // DWORD aligned
uint8_t *s, *d;
uint8x8x3_t u88line0, u88line, u88line2;
uint8x8_t u88Final;
uint16x8_t u168Sum;
int16x8_t i168divisor = vdupq_n_s16(7282/2); // 65536/9 - used with doubling saturating return high multiply

  for (int r = 0; r < srcHeight/3; r++)
    {
    d = &dst[iDestPitch * r];
    s = &src[srcWidth * r*3];

    for (int c = 0; c < srcWidth/3; c+=8)
    {
       // load 8 sets of 3x3 pixels (grayscale)
       u88line0 = vld3_u8(&s[0]);
       u88line1 = vld3_u8(&s[srcWidth]);
       u88line2 = vld3_u8(&s[srcWidth*2]);
       s += 24;
       // Sum vertically
       u168Sum = vaddl_u8(u88Line0.val[0], u88Line0.val[1]); // add with widening
       u168Sum = vaddw_u8(u168Sum, u88Line0.val[2]); // accumulate with widening (horizontally)
       u168Sum = vaddw_u8(u168Sum, u88Line1.val[0]); // add the other vectors together
       u168Sum = vaddw_u8(u168Sum, u88Line1.val[1]);
       u168Sum = vaddw_u8(u168Sum, u88Line1.val[2]);
       u168Sum = vaddw_u8(u168Sum, u88Line2.val[0]);
       u168Sum = vaddw_u8(u168Sum, u88Line2.val[1]);
       u168Sum = vaddw_u8(u168Sum, u88Line2.val[2]);
       // we now have the 8 sets of 3x3 pixels summed to 8 16-bit values   
       // To divide by 9 we will instead multiply by the inverse (65536/9) = 7282
       u168Sum = vreinterpretq_u16_s16(vqrdmulhq_s16(i168divisor, vreinterpretq_s16_u16(u168Sum)));
       u88Final = vmovn_u16(u168Sum); // narrow to 8 bits
       // store 8 bytes
       vst1_u8(d, u88Final);
       d += 8;    
   } // for column
} // for row
于 2013-03-20T16:06:36.193 回答
0

为了避免在将几个向量的字节相加时发生溢出,您应该在求和之前从字节扩展为半字(16 位)。将所有像素相加并划分结果后,您可以将结果缩小回字节。

用于在 GCC 中将字节扩展为半字的 NEON 内在函数是
uint16x8_t vmovl_u8 (uint8x8_t)

相应的缩小内在函数是
uint8x8_t vmovn_u16 (uint16x8_t)

请注意,如果将 9 个像素相加并除以 8,则在缩小回字节时仍可能存在溢出风险。在这种情况下,您可以使用vqmovn_u16which 表现得像vmovn_u16但也执行饱和。

于 2013-03-20T15:00:16.240 回答