1

我已经实现了可分离的高斯模糊。使用 SIMD 处理来优化水平通道相对容易。但是,我不确定如何优化垂直通道。

访问元素对缓存不是很友好,填充 SIMD 通道意味着读取许多不同的像素。我正在考虑转置图像并运行水平传递,然后将图像转回,但是,我不确定它是否会因为两个转置操作而获得任何改进。

我处理的图像相当大(分辨率为 16k),内核大小为 19;目前矢量化垂直通道大约只能带来 15% 的性能增益。

我的垂直通道(Vertical pass)代码如下(它位于一个以 T 为元素类型的泛型类内部,T 可以是 uint8_t 或 float):

// Separable Gaussian blur pass (fragment from inside a generic class:
// T = input element type, V = output element type, each uint8_t or float).
// Despite being called the "vertical" pass by the author, this loop walks
// rows and reads horizontally consecutive pixels: it produces 4 output
// pixels per SIMD iteration by dotting 4 overlapping kernel windows
// (shifted by 1 pixel each) with the kernel, 4 taps at a time.
//
// NOTE(review): yEnd is derived from GetWidth() and xEnd from GetHeigh()
// [sic] -- these look swapped (y should be bounded by the image height,
// x by the width); verify against the image accessors.
int yStart = kernelHalfSize;
int xStart = kernelHalfSize;
int yEnd = input.GetWidth() - kernelHalfSize;
int xEnd = input.GetHeigh() - kernelHalfSize;

const T * inData = input.GetData().data();
V * outData = output.GetData().data();

// kn: full (odd) kernel length. kn4: largest multiple of 4 <= kn, used by
// the unrolled SIMD tap loop; the remaining kn % 4 taps run in a tail loop.
int kn = kernelHalfSize * 2 + 1;
int kn4 = kn - kn % 4;

for (int y = yStart; y < yEnd; y++)
{
    // Linear offset of row y in the output image, and the running output index.
    size_t yW = size_t(y) * output.GetWidth();
    size_t outX = size_t(xStart) + yW;

    // Pixels [xStart, xEndSimd) are handled 4-wide; [xEndSimd, xEnd) falls
    // through to the scalar remainder loop at the bottom.
    size_t xEndSimd = xStart;

    int len = xEnd - xStart;
    len = len - len % 4;
    xEndSimd = xStart + len;

    for (int x = xStart; x < xEndSimd; x += 4)
    {
        // x0..x3: input offsets of the 4 overlapping kernel windows, one per
        // output pixel (window for pixel x+i starts one element after x+i-1's).
        size_t inYW = size_t(y) * input.GetWidth();
        size_t x0 = ((x + 0) - kernelHalfSize) + inYW;
        size_t x1 = x0 + 1;
        size_t x2 = x0 + 2;
        size_t x3 = x0 + 3;

        // Per-lane accumulator: lane i holds the running dot product of
        // output pixel x+i with the kernel.
        __m128 sumDot = _mm_setzero_ps();

        int i = 0;
        for (; i < kn4; i += 4)
        {               
            // Broadcast 4 consecutive kernel taps, one tap per register.
            __m128 kx = _mm_set_ps1(kernelDataX[i + 0]);
            __m128 ky = _mm_set_ps1(kernelDataX[i + 1]);
            __m128 kz = _mm_set_ps1(kernelDataX[i + 2]);
            __m128 kw = _mm_set_ps1(kernelDataX[i + 3]);

            
            // d{x,y,z,w}[lane] = input sample for tap i+{0,1,2,3} of window `lane`.
            __m128 dx, dy, dz, dw;

            if constexpr (std::is_same<T, uint8_t>::value)
            {
                //we need to convert uint8_t inputs to float
                // Each load fetches 16 bytes although only the low 4 are
                // consumed after widening.
                // NOTE(review): this can read up to 12 bytes past the last
                // needed sample near the end of the buffer -- confirm the
                // image has enough padding.
                __m128i u8_0 = _mm_loadu_si128((const __m128i*)(inData + x0));
                __m128i u8_1 = _mm_loadu_si128((const __m128i*)(inData + x1));
                __m128i u8_2 = _mm_loadu_si128((const __m128i*)(inData + x2));
                __m128i u8_3 = _mm_loadu_si128((const __m128i*)(inData + x3));

                // Zero-extend the low 4 bytes to 32-bit lanes:
                // unpacklo_epi8 widens u8 -> u16, unpacklo_epi16 widens u16 -> u32.
                __m128i u32_0 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_0, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_1 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_1, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_2 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_2, _mm_setzero_si128()),
                    _mm_setzero_si128());
                __m128i u32_3 = _mm_unpacklo_epi16(
                    _mm_unpacklo_epi8(u8_3, _mm_setzero_si128()),
                    _mm_setzero_si128());

                // Integer lanes -> float lanes.
                dx = _mm_cvtepi32_ps(u32_0);
                dy = _mm_cvtepi32_ps(u32_1);
                dz = _mm_cvtepi32_ps(u32_2);
                dw = _mm_cvtepi32_ps(u32_3);

            }
            else
            {
                /*
                //load 8 consecutive values
                auto dd = _mm256_loadu_ps(inData + x0);

                //extract parts by shifting and casting to 4 values float
                dx = _mm256_castps256_ps128(dd);
                dy = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 4, 3, 2, 1)));
                dz = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 5, 4, 3, 2)));
                dw = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(dd, _mm256_set_epi32(0, 0, 0, 0, 6, 5, 4, 3)));
                */

                // Four overlapping unaligned loads, each one float further along.
                dx = _mm_loadu_ps(inData + x0);
                dy = _mm_loadu_ps(inData + x1);
                dz = _mm_loadu_ps(inData + x2);
                dw = _mm_loadu_ps(inData + x3);
            }

            //calculate 4 dots at once
            //[dx, dy, dz, dw] <dot> [kx, ky, kz, kw]

            auto mx = _mm_mul_ps(dx, kx); //dx * kx
            auto my = _mm_fmadd_ps(dy, ky, mx); //mx + dy * ky
            auto mz = _mm_fmadd_ps(dz, kz, my); //my + dz * kz
            auto res = _mm_fmadd_ps(dw, kw, mz); //mz + dw * kw

            sumDot = _mm_add_ps(sumDot, res);

            // Advance all four windows past the 4 taps just consumed.
            x0 += 4;
            x1 += 4;
            x2 += 4;
            x3 += 4;
        }

        // Tail: remaining kn % 4 kernel taps, one tap per iteration,
        // gathering one sample per window into a vector.
        for (; i < kn; i++)
        {               
            auto v = _mm_set_ps1(kernelDataX[i]);
            auto v2 = _mm_set_ps(
                *(inData + x3), *(inData + x2), 
                *(inData + x1), *(inData + x0)
            );
            
            sumDot = _mm_add_ps(sumDot, _mm_mul_ps(v, v2));

            x0++;
            x1++;
            x2++;
            x3++;
        }

        // Apply the kernel normalization weight to all 4 results.
        sumDot = _mm_mul_ps(sumDot, _mm_set_ps1(weightX));

        if constexpr (std::is_same<V, uint8_t>::value)
        {
            // float -> int32 (round), then saturate-pack down to 4 uint8s
            // in the low 32 bits and store them with a single 32-bit write.
            __m128i asInt = _mm_cvtps_epi32(sumDot);
            
            asInt = _mm_packus_epi32(asInt, asInt);
            asInt = _mm_packus_epi16(asInt, asInt);

            uint32_t res = _mm_cvtsi128_si32(asInt);

            // NOTE(review): type-punning a uint8_t* destination through
            // uint32_t*; the address is not guaranteed 4-byte aligned --
            // works on x86 but is formally UB; consider memcpy.
            ((uint32_t *)(outData + outX))[0] = res;                
            outX += 4;
        }
        else 
        {
            // NOTE(review): _mm_store_ps requires 16-byte alignment and a
            // local float[4] is not guaranteed to have it -- consider
            // _mm_storeu_ps or alignas(16).
            float tmpRes[4];
            _mm_store_ps(tmpRes, sumDot);

            outData[outX + 0] = tmpRes[0];
            outData[outX + 1] = tmpRes[1];
            outData[outX + 2] = tmpRes[2];
            outData[outX + 3] = tmpRes[3];
            outX += 4;
        }
        
    }   

    // Scalar remainder: pixels that did not fit the 4-wide loop.
    for (int x = xEndSimd; x < xEnd; x++)
    {
        int kn = kernelHalfSize * 2 + 1; // shadows the outer kn (same value)
        const T * v = input.GetPixelStart(x - kernelHalfSize, y);
        float tmp = 0;
        for (int i = 0; i < kn; i++)
        {
            tmp += kernelDataX[i] * v[i];
        }
        tmp *= weightX;
        outData[outX] = ImageUtils::clamp_cast<V>(tmp);
        outX++;
    }
}
4

1 回答 1

2

有一个众所周知的技巧。

在计算这两个通道时,都按顺序读取输入、用 SIMD 进行计算,但在写出结果时改用标量存储,把结果以转置的形式写入另一个缓冲区。小技巧:SSE 4.1 提供了 `_mm_extract_ps`,只是不要忘记把目标图像指针从 `float*` 强制转换为 `int*`。关于这些存储操作还有一点:我建议使用 `_mm_stream_si32`,因为您希望把缓存空间尽量留给输入数据。当您计算第二遍时,您又是在按顺序读取内存地址,硬件预取器会处理好延迟。

这样两次传递都是相同的,我通常两次调用相同的函数,使用不同的缓冲区。

由这两次传递各自引起的转置相互抵消。顺便说一句,这里有一个 HLSL 版本。

还有更多。如果您的内核大小只有 19,则适合 3 个 AVX 寄存器。我认为 shuffle/permute/blend 指令仍然比 L1 缓存加载更快,即在循环外加载内核可能会更好。

于 2020-06-30T16:48:32.407 回答