0

我在 Vivado HLS 中编写了一个非局部均值(non-local means)去噪算法,并正在尝试对其进行优化。目前我保留了浮点表示,因为这是问题描述的一部分:我需要先验证浮点表示下的细节,然后再改用定点表示。我尝试了各种 pragma(如流水线、数组分区等),但得到的延迟约为 880 毫秒,这对图像处理应用来说并不理想。使用基本的 pragma 时,延迟可以降到 5 毫秒,但报告显示 FPGA 板上的 DSP 已经用尽,触发器和 LUT 等其他资源的占用也相当高(我已经选择了可用的最大开发板)。

我对 Vivado HLS 还比较陌生,想听听大家的建议:在保持浮点表示的前提下,如何实现完美的流水线(II = 1)以降低延迟,同时尽量减少片上资源的占用。

代码附在下面:

//header
/// Floating-point RGB triple: one float per colour channel.
/// Kept as a plain aggregate (no constructors) so it can be brace-initialised
/// and copied member-wise.
struct pixel {
    float r;  ///< red channel
    float g;  ///< green channel
    float b;  ///< blue channel
};

/// 8-bit RGB triple: one unsigned byte per colour channel.
/// Kept as a plain aggregate (no constructors) so it can be brace-initialised
/// and copied member-wise.
struct pixel_8 {
    uint8_t r;  ///< red channel
    uint8_t g;  ///< green channel
    uint8_t b;  ///< blue channel
};
// Three-element hls::Scalar of unsigned char. In this file it is the element
// type moved in and out of the image objects with >> / <<, and its channels
// are accessed through .val[0], .val[1] and .val[2].
typedef hls::Scalar<3, unsigned char>                 RGB_PIXEL;

//core
#include "denoise_h.hpp"
#include <math.h>
// Two-argument minimum / maximum as function-like macros.
// Each argument is expanded twice, so pass only side-effect-free expressions.
// When the comparison (x) > (y) is false (including for NaN operands),
// min_def yields (x) and max_def yields (y).
#define min_def(x, y) (((x) > (y)) ? (y) : (x))
#define max_def(x, y) (((x) > (y)) ? (x) : (y))
//using namespace std;

pixel_8 denoise_interpol(pixel win[5][5])
{
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_RESHAPE variable=win complete dim=1
    pixel dist_tl[3][3], dist_tm[3][3], dist_tr[3][3], dist_ml[3][3], dist_mm[3][3], dist_mr[3][3], dist_bl[3][3], dist_bm[3][3], dist_br[3][3];
    pixel_8 out_pix;
    float total_dist_tl[3][3], total_dist_tm[3][3], total_dist_tr[3][3], total_dist_ml[3][3], total_dist_mm[3][3], total_dist_mr[3][3], total_dist_bl[3][3], total_dist_bm[3][3], total_dist_br[3][3];
    float dy_l[3][3], dy_m[3][3], dy_r[3][3];
    float dx[3][3], weights[3][3];
    float alpha_red[3][3], alpha_green[3][3], alpha_blue[3][3], alpha_alpha[3][3];
    pixel nlms;
    float nlms_alpha;
    pixel nlm;
    int search_area = 3;
    int patch_size = 3;//dx
    float sigma = (float)0.12;//dy
    float inv_sigma_sq = -1.0f / (sigma*sigma*patch_size*patch_size);
    denoise_interpol_first_loop:for (int l = 0; l < 3; l++)
    {
        denoise_interpol_first_loop_inner:for (int m = 0; m < 3; m++)
        {

            //storing values in alpha window
            alpha_red[l][m] = win[l + 1][m + 1].r;
            alpha_green[l][m] = win[l + 1][m + 1].g;
            alpha_blue[l][m] = win[l + 1][m + 1].b;
            alpha_alpha[l][m] = 1.0;

            //red
            dist_tl[l][m].r = (win[l][m].r - win[1][1].r) * (win[l][m].r - win[1][1].r);
            dist_tm[l][m].r = (win[l][m + 1].r - win[1][2].r) * (win[l][m + 1].r - win[1][2].r);
            dist_tr[l][m].r = (win[l][m + 2].r - win[1][3].r) * (win[l][m + 2].r - win[1][3].r);

            dist_ml[l][m].r = (win[l + 1][m].r - win[2][1].r) * (win[l + 1][m].r - win[2][1].r);
            dist_mm[l][m].r = (win[l + 1][m + 1].r - win[2][2].r) * (win[l + 1][m + 1].r - win[2][2].r);
            dist_mr[l][m].r = (win[l + 1][m + 2].r - win[2][3].r) * (win[l + 1][m + 2].r - win[2][3].r);

            dist_bl[l][m].r = (win[l + 2][m].r - win[3][1].r) * (win[l + 2][m].r - win[3][1].r);
            dist_bm[l][m].r = (win[l + 2][m + 1].r - win[3][2].r) * (win[l + 2][m + 1].r - win[3][2].r);
            dist_br[l][m].r = (win[l + 2][m + 2].r - win[3][3].r) * (win[l + 2][m + 2].r - win[3][3].r);

            //green
            dist_tl[l][m].g = (win[l][m].g - win[1][1].g) * (win[l][m].g - win[1][1].g);
            dist_tm[l][m].g = (win[l][m + 1].g - win[1][2].g) * (win[l][m + 1].g - win[1][2].g);
            dist_tr[l][m].g = (win[l][m + 2].g - win[1][3].g) * (win[l][m + 2].g - win[1][3].g);

            dist_ml[l][m].g = (win[l + 1][m].g - win[2][1].g) * (win[l + 1][m].g - win[2][1].g);
            dist_mm[l][m].g = (win[l + 1][m + 1].g - win[2][2].g) * (win[l + 1][m + 1].g - win[2][2].g);
            dist_mr[l][m].g = (win[l + 1][m + 2].g - win[2][3].g) * (win[l + 1][m + 2].g - win[2][3].g);

            dist_bl[l][m].g = (win[l + 2][m].g - win[3][1].g) * (win[l + 2][m].g - win[3][1].g);
            dist_bm[l][m].g = (win[l + 2][m + 1].g - win[3][2].g) * (win[l + 2][m + 1].g - win[3][2].g);
            dist_br[l][m].g = (win[l + 2][m + 2].g - win[3][3].g) * (win[l + 2][m + 2].g - win[3][3].g);

            //blue
            dist_tl[l][m].b = (win[l][m].b - win[1][1].b) * (win[l][m].b - win[1][1].b);
            dist_tm[l][m].b = (win[l][m + 1].b - win[1][2].b) * (win[l][m + 1].b - win[1][2].b);
            dist_tr[l][m].b = (win[l][m + 2].b - win[1][3].b) * (win[l][m + 2].b - win[1][3].b);

            dist_ml[l][m].b = (win[l + 1][m].b - win[2][1].b) * (win[l + 1][m].b - win[2][1].b);
            dist_mm[l][m].b = (win[l + 1][m + 1].b - win[2][2].b) * (win[l + 1][m + 1].b - win[2][2].b);
            dist_mr[l][m].b = (win[l + 1][m + 2].b - win[2][3].b) * (win[l + 1][m + 2].b - win[2][3].b);

            dist_bl[l][m].b = (win[l + 2][m].b - win[3][1].b) * (win[l + 2][m].b - win[3][1].b);
            dist_bm[l][m].b = (win[l + 2][m + 1].b - win[3][2].b) * (win[l + 2][m + 1].b - win[3][2].b);
            dist_br[l][m].b = (win[l + 2][m + 2].b - win[3][3].b) * (win[l + 2][m + 2].b - win[3][3].b);

            //caluclating distances
            total_dist_tl[l][m] = dist_tl[l][m].r + dist_tl[l][m].g + dist_tl[l][m].b;
            total_dist_tm[l][m] = dist_tm[l][m].r + dist_tm[l][m].g + dist_tm[l][m].b;
            total_dist_tr[l][m] = dist_tr[l][m].r + dist_tr[l][m].g + dist_tr[l][m].b;

            total_dist_ml[l][m] = dist_ml[l][m].r + dist_ml[l][m].g + dist_ml[l][m].b;
            total_dist_mm[l][m] = dist_mm[l][m].r + dist_mm[l][m].g + dist_mm[l][m].b;
            total_dist_mr[l][m] = dist_mr[l][m].r + dist_mr[l][m].g + dist_mr[l][m].b;

            total_dist_bl[l][m] = dist_bl[l][m].r + dist_bl[l][m].g + dist_bl[l][m].b;
            total_dist_bm[l][m] = dist_bm[l][m].r + dist_bm[l][m].g + dist_bm[l][m].b;
            total_dist_br[l][m] = dist_br[l][m].r + dist_br[l][m].g + dist_br[l][m].b;

        }

    }
    for (int l = 0; l < 3; l++)
    {
        denoise_interpol_second_inner:for (int m = 0; m < 3; m++)
        {
            dy_l[l][m] = total_dist_tl[l][m] + total_dist_ml[l][m] + total_dist_bl[l][m];
            dy_m[l][m] = total_dist_tm[l][m] + total_dist_mm[l][m] + total_dist_bm[l][m];
            dy_r[l][m] = total_dist_tr[l][m] + total_dist_mr[l][m] + total_dist_br[l][m];
        }
    }
    for (int l = 0; l < 3; l++)
    {
        denoise_interpol_third_inner:for (int m = 0; m < 3; m++)
        {
            dx[l][m] = dy_l[l][m] + dy_m[l][m] + dy_r[l][m];
            //cout << endl<<"dx = " << dx[l][m];
            weights[l][m] = exp(dx[l][m] * inv_sigma_sq);
        }
    }
    nlms.r = 0;
    nlms.g = 0;
    nlms.b = 0;
    nlms_alpha = 0;
    for (int l = 0; l < 3; l++)
    {
        denoise_interpol_fourth_inner:for (int m = 0; m < 3; m++)
        {
            nlms.r = nlms.r + (weights[l][m] * alpha_red[l][m]);
            nlms.g = nlms.g + (weights[l][m] * alpha_green[l][m]);
            nlms.b = nlms.b + (weights[l][m] * alpha_blue[l][m]);
            nlms_alpha = nlms_alpha + (weights[l][m] * 1.0);
        }
    }
    nlm.r = min_def(max_def((nlms.r / nlms_alpha), 0.0f), 1.0f);
    nlm.g = min_def(max_def((nlms.g / nlms_alpha), 0.0f), 1.0f);
    nlm.b = min_def(max_def((nlms.b / nlms_alpha), 0.0f), 1.0f);

    out_pix.r = nlm.r * 255;
    out_pix.g = nlm.g * 255;
    out_pix.b = nlm.b * 255;

    return out_pix;
}
/**
 * Streaming driver: feeds a sliding 5x5 window to denoise_interpol.
 *
 * For each of the r x c input pixels it converts the 8-bit channels to float,
 * shifts the 5x5 window one column to the left, and loads the new right-hand
 * column from the four buffered previous rows plus the incoming pixel. Once
 * at least five rows and five columns have been consumed (row > 3 and
 * col > 3) it writes one denoised pixel to the output image.
 *
 * NOTE(review): only (r - 4) * (c - 4) pixels are written to img_out. Confirm
 * that the dimensions Mat2AXIvideo expects for img_out match, otherwise the
 * output stage may wait for pixels that are never produced.
 * NOTE(review): line_buf holds 516 columns and c is not checked against that
 * bound; callers presumably guarantee c <= 516 - verify.
 *
 * @param INPUT_STREAM   AXI stream the input image is read from.
 * @param OUTPUT_STREAM  AXI stream the denoised image is written to.
 * @param r              Number of rows processed.
 * @param c              Number of columns processed per row.
 */
void denoise_filter(AXI_STREAM& INPUT_STREAM, AXI_STREAM& OUTPUT_STREAM, int r, int c)
{
#pragma HLS INTERFACE axis port=INPUT_STREAM
#pragma HLS INTERFACE axis port=OUTPUT_STREAM
#pragma HLS dataflow
    RGB_IMAGE_4 img_in;
    RGB_IMAGE img_out;

    // All 25 window pixels are read in every iteration, so the window is
    // split into individual elements rather than kept in one memory.
    pixel window[5][5];
#pragma HLS ARRAY_PARTITION variable=window complete dim=0

    // One memory per buffered row: every iteration reads and writes all four
    // rows at the same column, which a single memory is unlikely to serve at
    // an initiation interval of 1.
    pixel line_buf[4][516];
#pragma HLS ARRAY_PARTITION variable=line_buf complete dim=1

    hls::AXIvideo2Mat(INPUT_STREAM, img_in);
    r_uc2float:for (int row = 0; row < r; row++)
    {
#pragma HLS LOOP_TRIPCOUNT min=3 max=260 avg=260
        for (int col = 0; col < c; col++)
        {
#pragma HLS LOOP_TRIPCOUNT min=3 max=516 avg=516
#pragma HLS pipeline
            RGB_PIXEL pin;
            img_in >> pin;

            // Normalise with the same factor (255) that denoise_interpol uses
            // when scaling back to 8 bits; the float literal keeps the
            // division in single precision.
            pixel input_pix;
            input_pix.r = static_cast<float>(pin.val[0]) / 255.0f;
            input_pix.g = static_cast<float>(pin.val[1]) / 255.0f;
            input_pix.b = static_cast<float>(pin.val[2]) / 255.0f;

            // Shift the window one column to the left.
            for (int i = 0; i < 5; i++)
            {
                for (int j = 0; j < 4; j++)
                {
                    window[i][j] = window[i][j + 1];
                }
            }

            // New right-hand column: the oldest buffered row goes to the top,
            // every buffered row moves up one slot, and the incoming pixel
            // becomes the bottom row.
            window[0][4] = line_buf[0][col];
            for (int i = 0; i < 3; i++)
            {
                const pixel next = line_buf[i + 1][col];
                line_buf[i][col] = next;
                window[i + 1][4] = next;
            }
            line_buf[3][col] = input_pix;
            window[4][4] = input_pix;

            // The loop bounds already guarantee row < r and col < c; only the
            // window warm-up needs checking.
            if (row > 3 && col > 3)
            {
                const pixel_8 out_pix = denoise_interpol(window);
                RGB_PIXEL out;
                out.val[0] = out_pix.r;
                out.val[1] = out_pix.g;
                out.val[2] = out_pix.b;
                img_out << out;
            }
        }
    }
    hls::Mat2AXIvideo(img_out, OUTPUT_STREAM);
}

一旦达到所需的延迟,我就会从浮点表示转为定点表示;如果用浮点表示无法达到目标,我也可以直接从定点表示开始。

非常感谢任何建议或想法,因为我目前有些卡住了,不知道下一步该怎么做。

谢谢你

4

0 回答 0