c++ - 极慢的双线性插值（与 OpenCV 相比）

Question

/**
 * Resizes `src` with bilinear interpolation and returns the resized image.
 *
 * @param src   source image; T is indexed per channel (e.g. cv::Vec3b).
 * @param dsize output size; when its area is zero the output is sized as
 *              (src.rows * dy) x (src.cols * dx) instead.
 * @param dx    horizontal scale factor, only used when dsize has zero area.
 * @param dy    vertical scale factor, only used when dsize has zero area.
 * @return the interpolated image.
 */
template<typename T>
cv::Mat_<T> const bilinear_interpolation(cv::Mat_<T> const &src, cv::Size dsize,
                                     float dx, float dy)
{
    cv::Mat_<T> dst = dsize.area() != 0 ? cv::Mat_<T>(dsize) :
                                        cv::Mat_<T>(src.rows * dy, src.cols * dx);

    // Source distance covered by one destination pixel. Using (size - 1) keeps
    // the "+ 1" neighbour taps below inside the source image.
    float const step_x = static_cast<float>((src.cols - 1)) / dst.cols;
    float const step_y = static_cast<float>((src.rows - 1)) / dst.rows;

    for(int out_row = 0; out_row < dst.rows; ++out_row)
    {
        // Top source row and the fractional offset below it.
        int const top = static_cast<int>(out_row * step_y);
        float const frac_y = (out_row * step_y) - top;
        float const inv_frac_y = 1 - frac_y;

        // Channels of one destination row are written through this pointer.
        auto *out = &dst(out_row, 0)[0];
        for(int out_col = 0; out_col < dst.cols; ++out_col)
        {
            // Left source column and the fractional offset to its right.
            int const left = static_cast<int>(out_col * step_x);
            float const frac_x = (out_col * step_x) - left;
            float const inv_frac_x = 1 - frac_x;

            // Weights of the four neighbouring source pixels.
            float const w_top_left = inv_frac_y * inv_frac_x;
            float const w_top_right = inv_frac_y * frac_x;
            float const w_bottom_left = frac_y * inv_frac_x;
            float const w_bottom_right = frac_y * frac_x;

            for(int ch = 0; ch < cv::DataType<T>::channels; ++ch)
            {
                *out = w_top_left * src(top, left)[ch] +
                       w_top_right * src(top, left + 1)[ch] +
                       w_bottom_left * src(top + 1, left)[ch] +
                       w_bottom_right * src(top + 1, left + 1)[ch];
                ++out;
            }
        }
    }

    return dst;
}

这是双线性插值的实现，我用它来将 512 * 512 的图像（“lena.png”）放大到 2048 * 2048。完成这项工作需要 0.195 秒，但 OpenCV 的 cv::resize（非 GPU 版本）只需要 0.026 秒。我不知道是什么让我的程序这么慢（OpenCV 比我快了将近 750%），我想看看 OpenCV 中 resize 的源代码，但我找不到它的实现。

你知道为什么 OpenCV 的调整大小会这么快或者我的双线性太慢吗？

    // Timed run of the hand-written bilinear_interpolation.
    // NOTE(review): timeEstimate is not defined in this excerpt; presumably it is
    // a scope timer that reports when `time` is destroyed at the closing brace.
    {
        timeEstimate<> time;
        cv::Mat_<cv::Vec3b> const src = input;
        // An empty cv::Size() is passed together with dx / dy; the returned
        // image is discarded.
        bilinear_interpolation(src, cv::Size(), dx, dy);
        std::cout << "bilinear" << std::endl;
    }

    // Timed run of cv::resize with cv::INTER_LINEAR, for comparison.
    // NOTE(review): timeEstimate is not defined in this excerpt; presumably it is
    // a scope timer that reports when `time` is destroyed at the closing brace.
    {
        timeEstimate<> time;
        // NOTE(review): the clone is made inside the timed scope, after `time`
        // is constructed.
        cv::Mat output = input.clone();
        cv::resize(input, output, cv::Size(), dx, dy, cv::INTER_LINEAR);
        std::cout << "bilinear cv" << std::endl;
    }

编译器：mingw4.6.2 操作系统：win7 64bits cpu：英特尔® i3-2330M (2.2G)

score 5 · Accepted Answer

有两个主要因素使 OpenCV 的版本更快：

OpenCV 将调整大小实现为“可分离操作”。即它分两步完成：图像水平拉伸，然后垂直拉伸。这种技术允许使用较少的算术运算来调整大小。
手工编码的 SSE 优化。

score 2 · Accepted Answer

最近在一些基于 CPU 的图形代码中添加双线性升级时，我遇到了同样的问题。

首先，我使用以下配置运行您的代码：

操作系统：虚拟机中的 Xubuntu 20
编译器：gcc 9.3.0
OpenCV 版本：4.2.0
CPU：i3-6100u (2.3 GHz)
源位图大小：512x512
目标位图大小：2048x2048

我发现你的代码用了 92ms 而 OpenCV 用了 4.2ms。所以现在的差异比你在 2012 年问这个问题时更大。我猜 OpenCV 从那时起优化得更多。

（此时我切换到在 Windows 中使用 Visual Studio 2013，为 x64 目标构建）。

将代码转换为使用定点算术后，时间减少到 30 毫秒。定点算术之所以有帮助，是因为它让数据始终保持为整数：输入和输出数据本来就是整数，把它们转换为浮点数再转换回来的开销很大。如果我继续使用 GCC 9.3，我预计速度会更快，因为我通常发现它生成的代码比 VS 2013 更快。无论如何，这是代码：

// One pixel, addressable either as a single packed word or per channel.
// NOTE(review): the two views only line up if `unsigned` is 4 bytes wide; the
// anonymous struct is a widely supported compiler extension in C++.
typedef union {
    // All channels as one word.
    unsigned c;
    // The same storage as individual bytes, laid out in b, g, r, a order.
    struct {
        unsigned char b;
        unsigned char g;
        unsigned char r;
        unsigned char a;
    };
} DfColour;

// A bitmap: its dimensions plus a pointer to its pixel storage.
typedef struct _DfBitmap {
    // Number of pixels per row.
    int width;
    // Number of rows.
    int height;
    // Pixel storage; the resize routines index it as pixels[y * width + x].
    DfColour *pixels;
} DfBitmap;

// Scales `src` by `scale` into `dst` using bilinear interpolation with 8-bit
// fixed-point weights. Only the r, g and b channels of `dst` are written.
//
// Preconditions (not checked): scale > 0, and `dst` is at least
// (int)(scale * src->width) wide and (int)(scale * src->height) high.
//
// The second tap of each axis is clamped to the last source row / column, so
// small sources no longer cause a read past the edge of the bitmap.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits. The factor
    // is 255 rather than 256, so the step is slightly smaller than 1/scale.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;
    int lastSrcY = src->height - 1;
    int lastSrcX = src->width - 1;

    // For every output pixel...
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Low 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Row below, or the same row again when srcY is already the last one.
        DfColour *srcRowBelow = srcY < lastSrcY ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < dstW; x++, dstPixel++) {
            // Perform bilinear interpolation on 2x2 src pixels.

            // Upper bits: source column. Low 8 bits: weight of the next column.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Next column, or the same column again at the right edge.
            int srcX1 = srcX < lastSrcX ? srcX + 1 : srcX;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[srcX];
            unsigned w = (weightX * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,0
            srcPixel = &srcRow[srcX1];
            w = (weightX2 * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,1
            srcPixel = &srcRowBelow[srcX1];
            w = (weightX2 * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 0,1
            srcPixel = &srcRowBelow[srcX];
            w = (weightX * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Drop the 8 fractional bits of the weighted sums.
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}

切换到更好的算法将时间减少到 19.5 毫秒。正如 Andrey Kamaev 的回答所说，更好的算法把垂直和水平方向的缩放拆分成两遍（pass）独立的处理。目标位图被用作第一遍输出的临时存储空间。第二遍中对 X 的遍历是从后向前进行的，以避免覆盖即将要用到的数据。这是代码：

// Scales `src` by `scale` into `dst` with bilinear interpolation, done as two
// separate passes: a vertical pass that writes src->width pixels into every
// destination row, then a horizontal pass that stretches each row in place.
// Only the r, g and b channels of `dst` are written.
//
// Preconditions (not checked): scale >= 1 (each dst row must hold src->width
// pixels and the in-place pass relies on srcX < x), and `dst` is at least
// (int)(scale * src->width) wide and (int)(scale * src->height) high.
//
// The second tap of each pass is clamped to the last source row / column, so
// small sources no longer read past the bitmap or sample unfilled dst pixels.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits. The factor
    // is 255 rather than 256, so the step is slightly smaller than 1/scale.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;
    int lastSrcY = src->height - 1;
    int lastSrcX = src->width - 1;

    // Pass 1: vertical interpolation into the first src->width pixels of each
    // destination row.
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Low 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Row below, or the same row again when srcY is already the last one.
        DfColour *srcRowBelow = srcY < lastSrcY ? srcRow + src->width : srcRow;

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned r = 0, g = 0, b = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            r += srcPixel->r * weightY;
            g += srcPixel->g * weightY;
            b += srcPixel->b * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            r += srcPixel->r * weightY2;
            g += srcPixel->g * weightY2;
            b += srcPixel->b * weightY2;

            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }

    // Pass 2: horizontal interpolation, in place within each destination row.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        // Walk right to left so pixels still needed as input are not
        // overwritten first. x == 0 is skipped; that pixel keeps its pass 1
        // value.
        for (int x = dstW - 1; x; x--) {
            // Upper bits: source column. Low 8 bits: weight of the next column.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Next column, or the same column again at the right edge of the
            // data produced by pass 1.
            int srcX1 = srcX < lastSrcX ? srcX + 1 : srcX;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            r += srcPixel->r * weightX;
            g += srcPixel->g * weightX;
            b += srcPixel->b * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcX1];
            r += srcPixel->r * weightX2;
            g += srcPixel->g * weightX2;
            b += srcPixel->b * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}

使用简单的可移植 SIMD 方案将时间减少到 16.5 毫秒。该方案不使用 SSE/AVX 等专有指令集扩展，而是用一个小技巧，把红色和蓝色通道放在同一个 32 位整数中一起存储和运算。它不如 AVX 实现快，但优点是简单。这是代码：

// Scales `src` by `scale` into `dst` with bilinear interpolation, done as a
// vertical pass followed by an in-place horizontal pass. Two channels are
// blended with a single multiply by masking the packed word with 0xff00ff;
// the g channel is blended on its own.
//
// Storing to `c` overwrites all four bytes of the destination pixel, so the
// `a` channel of `dst` is not preserved.
//
// NOTE(review): the 0xff00ff mask selects the b and r bytes only on a
// little-endian target with a 32-bit `unsigned` - confirm for the platform.
//
// Preconditions (not checked): scale >= 1 (each dst row must hold src->width
// pixels and the in-place pass relies on srcX < x), and `dst` is at least
// (int)(scale * src->width) wide and (int)(scale * src->height) high.
//
// The second tap of each pass is clamped to the last source row / column, so
// small sources no longer read past the bitmap or sample unfilled dst pixels.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits. The factor
    // is 255 rather than 256, so the step is slightly smaller than 1/scale.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;
    int lastSrcY = src->height - 1;
    int lastSrcX = src->width - 1;

    // Pass 1: vertical interpolation into the first src->width pixels of each
    // destination row.
    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Low 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Row below, or the same row again when srcY is already the last one.
        DfColour *srcRowBelow = srcY < lastSrcY ? srcRow + src->width : srcRow;

        // The two weights always add up to 256, so each masked byte times its
        // weight stays within the 16 bits available to it in `rb`.
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            // The store to c leaves leftover fraction bits in the g byte; the
            // store to g right after replaces them.
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }

    // Pass 2: horizontal interpolation, in place within each destination row.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        // Walk right to left so pixels still needed as input are not
        // overwritten first. x == 0 is skipped; that pixel keeps its pass 1
        // value.
        for (int x = dstW - 1; x; x--) {
            // Upper bits: source column. Low 8 bits: weight of the next column.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Next column, or the same column again at the right edge of the
            // data produced by pass 1.
            int srcX1 = srcX < lastSrcX ? srcX + 1 : srcX;

            unsigned rb = 0, g = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcX1];
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }
}

可以让垂直和水平两遍处理在逻辑上保持分离，同时把它们的外层 Y 循环合并为一个。这改善了缓存局部性，并使代码更简单一些。这样合并两遍处理后，时间减少到 14.6 毫秒。这是代码：

// Scales `src` by `scale` into `dst` with bilinear interpolation. For each
// destination row it first blends two source rows into the start of that row,
// then stretches the row horizontally in place. Two channels are blended with
// a single multiply by masking the packed word with 0xff00ff; the g channel is
// blended on its own.
//
// Storing to `c` overwrites all four bytes of the destination pixel, so the
// `a` channel of `dst` is not preserved.
//
// NOTE(review): the 0xff00ff mask selects the b and r bytes only on a
// little-endian target with a 32-bit `unsigned` - confirm for the platform.
//
// Preconditions (not checked): scale >= 1 (each dst row must hold src->width
// pixels and the in-place step relies on srcX < x), and `dst` is at least
// (int)(scale * src->width) wide and (int)(scale * src->height) high.
//
// The second tap of each step is clamped to the last source row / column, so
// small sources no longer read past the bitmap or sample unfilled dst pixels.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits. The factor
    // is 255 rather than 256, so the step is slightly smaller than 1/scale.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;
    int lastSrcY = src->height - 1;
    int lastSrcX = src->width - 1;

    for (int y = 0; y < dstH; y++) {
        // Upper bits: source row. Low 8 bits: weight of the row below it.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];
        // Row below, or the same row again when srcY is already the last one.
        DfColour *srcRowBelow = srcY < lastSrcY ? srcRow + src->width : srcRow;

        // The two weights always add up to 256, so each masked byte times its
        // weight stays within the 16 bits available to it in `rb`.
        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        // Vertical step: fill the first src->width pixels of this row.
        for (int x = 0; x < src->width; x++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0
            srcPixel = &srcRowBelow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            // The store to c leaves leftover fraction bits in the g byte; the
            // store to g right after replaces them.
            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        // Horizontal step, in place. Walk right to left so pixels still needed
        // as input are not overwritten first. x == 0 is skipped; that pixel
        // keeps the value written by the vertical step.
        for (int x = dstW - 1; x; x--) {
            unsigned rb = 0, g = 0;

            // Upper bits: source column. Low 8 bits: weight of the next column.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;
            // Next column, or the same column again at the right edge of the
            // data produced by the vertical step.
            int srcX1 = srcX < lastSrcX ? srcX + 1 : srcX;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1
            srcPixel = &dstRow[srcX1];
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}

此时代码仍然是单线程的。我的 CPU 总共有两个物理内核和 4 个线程。OpenCV 在我的机器上使用 2 个线程。我希望将代码转换为使用 2 个线程会将时间减少到大约 8 毫秒。

我不知道需要什么其他技巧才能将时间缩短到 4 毫秒，尽管可能需要转换为真正的 AVX SIMD 实现。

score 0 · Accepted Answer

也许有点晚了，但还要检查您是否在调试模式下运行您的应用程序。OpenCV 是一个库，很可能是以发布（Release）模式编译的——即启用了编译器优化。

c++ - 极慢的双线性插值（与 OpenCV 相比）

3 回答 3

Related

Reference