我已经在 Vivado HLS 中编写了一个非局部均值去噪算法,并正在尝试对其进行优化。目前我保留了浮点表示,因为它是问题描述的一部分:我需要先检查浮点表示下的细节,然后再转向定点表示。我尝试了各种 pragma(如流水线、数组分区等),但实现的延迟约为 880 毫秒,这对于图像处理应用来说并不理想。仅使用基本的编译指示时,我能够达到 5 毫秒,但工具显示 FPGA 板上的 DSP 已经用尽,触发器和 LUT 等其他资源的占用也相当高(我已经选择了可用的最大开发板)。
我对 Vivado HLS 比较陌生,想听听一些想法:如何(在浮点表示下)实现完美的流水线(II = 1)以减少延迟,同时占用更少的片上资源。
我附上了下面的代码,
//header
// Floating-point RGB sample used for the filter arithmetic.
// Plain aggregate: three single-precision channels, in r, g, b order.
struct pixel {
    float r;  // red channel
    float g;  // green channel
    float b;  // blue channel
};
// 8-bit-per-channel RGB sample; this is the type the denoise kernel returns.
// Plain aggregate: three unsigned bytes, in r, g, b order.
struct pixel_8 {
    uint8_t r;  // red channel
    uint8_t g;  // green channel
    uint8_t b;  // blue channel
};
// Three-channel pixel with unsigned char components, as read from and
// written to the image objects in denoise_filter.
typedef hls::Scalar<3, unsigned char> RGB_PIXEL;
//core
#include "denoise_h.hpp"
// Function-like min/max helpers. Being macros, each argument is expanded
// (and may be evaluated) more than once, so avoid passing expressions that
// are expensive or have side effects.
// min_def(x, y): yields y when x > y, otherwise x.
#define min_def(x,y) ( (x)>(y) ? (y) : (x) )
// max_def(x, y): yields x when x > y, otherwise y.
#define max_def(x,y) ( (x)>(y) ? (x) : (y) )
//using namespace std;
pixel_8 denoise_interpol(pixel win[5][5])
{
//#pragma HLS PIPELINE
//#pragma HLS ARRAY_RESHAPE variable=win complete dim=1
pixel dist_tl[3][3], dist_tm[3][3], dist_tr[3][3], dist_ml[3][3], dist_mm[3][3], dist_mr[3][3], dist_bl[3][3], dist_bm[3][3], dist_br[3][3];
pixel_8 out_pix;
float total_dist_tl[3][3], total_dist_tm[3][3], total_dist_tr[3][3], total_dist_ml[3][3], total_dist_mm[3][3], total_dist_mr[3][3], total_dist_bl[3][3], total_dist_bm[3][3], total_dist_br[3][3];
float dy_l[3][3], dy_m[3][3], dy_r[3][3];
float dx[3][3], weights[3][3];
float alpha_red[3][3], alpha_green[3][3], alpha_blue[3][3], alpha_alpha[3][3];
pixel nlms;
float nlms_alpha;
pixel nlm;
int search_area = 3;
int patch_size = 3;//dx
float sigma = (float)0.12;//dy
float inv_sigma_sq = -1.0f / (sigma*sigma*patch_size*patch_size);
denoise_interpol_first_loop:for (int l = 0; l < 3; l++)
{
denoise_interpol_first_loop_inner:for (int m = 0; m < 3; m++)
{
//storing values in alpha window
alpha_red[l][m] = win[l + 1][m + 1].r;
alpha_green[l][m] = win[l + 1][m + 1].g;
alpha_blue[l][m] = win[l + 1][m + 1].b;
alpha_alpha[l][m] = 1.0;
//red
dist_tl[l][m].r = (win[l][m].r - win[1][1].r) * (win[l][m].r - win[1][1].r);
dist_tm[l][m].r = (win[l][m + 1].r - win[1][2].r) * (win[l][m + 1].r - win[1][2].r);
dist_tr[l][m].r = (win[l][m + 2].r - win[1][3].r) * (win[l][m + 2].r - win[1][3].r);
dist_ml[l][m].r = (win[l + 1][m].r - win[2][1].r) * (win[l + 1][m].r - win[2][1].r);
dist_mm[l][m].r = (win[l + 1][m + 1].r - win[2][2].r) * (win[l + 1][m + 1].r - win[2][2].r);
dist_mr[l][m].r = (win[l + 1][m + 2].r - win[2][3].r) * (win[l + 1][m + 2].r - win[2][3].r);
dist_bl[l][m].r = (win[l + 2][m].r - win[3][1].r) * (win[l + 2][m].r - win[3][1].r);
dist_bm[l][m].r = (win[l + 2][m + 1].r - win[3][2].r) * (win[l + 2][m + 1].r - win[3][2].r);
dist_br[l][m].r = (win[l + 2][m + 2].r - win[3][3].r) * (win[l + 2][m + 2].r - win[3][3].r);
//green
dist_tl[l][m].g = (win[l][m].g - win[1][1].g) * (win[l][m].g - win[1][1].g);
dist_tm[l][m].g = (win[l][m + 1].g - win[1][2].g) * (win[l][m + 1].g - win[1][2].g);
dist_tr[l][m].g = (win[l][m + 2].g - win[1][3].g) * (win[l][m + 2].g - win[1][3].g);
dist_ml[l][m].g = (win[l + 1][m].g - win[2][1].g) * (win[l + 1][m].g - win[2][1].g);
dist_mm[l][m].g = (win[l + 1][m + 1].g - win[2][2].g) * (win[l + 1][m + 1].g - win[2][2].g);
dist_mr[l][m].g = (win[l + 1][m + 2].g - win[2][3].g) * (win[l + 1][m + 2].g - win[2][3].g);
dist_bl[l][m].g = (win[l + 2][m].g - win[3][1].g) * (win[l + 2][m].g - win[3][1].g);
dist_bm[l][m].g = (win[l + 2][m + 1].g - win[3][2].g) * (win[l + 2][m + 1].g - win[3][2].g);
dist_br[l][m].g = (win[l + 2][m + 2].g - win[3][3].g) * (win[l + 2][m + 2].g - win[3][3].g);
//blue
dist_tl[l][m].b = (win[l][m].b - win[1][1].b) * (win[l][m].b - win[1][1].b);
dist_tm[l][m].b = (win[l][m + 1].b - win[1][2].b) * (win[l][m + 1].b - win[1][2].b);
dist_tr[l][m].b = (win[l][m + 2].b - win[1][3].b) * (win[l][m + 2].b - win[1][3].b);
dist_ml[l][m].b = (win[l + 1][m].b - win[2][1].b) * (win[l + 1][m].b - win[2][1].b);
dist_mm[l][m].b = (win[l + 1][m + 1].b - win[2][2].b) * (win[l + 1][m + 1].b - win[2][2].b);
dist_mr[l][m].b = (win[l + 1][m + 2].b - win[2][3].b) * (win[l + 1][m + 2].b - win[2][3].b);
dist_bl[l][m].b = (win[l + 2][m].b - win[3][1].b) * (win[l + 2][m].b - win[3][1].b);
dist_bm[l][m].b = (win[l + 2][m + 1].b - win[3][2].b) * (win[l + 2][m + 1].b - win[3][2].b);
dist_br[l][m].b = (win[l + 2][m + 2].b - win[3][3].b) * (win[l + 2][m + 2].b - win[3][3].b);
//caluclating distances
total_dist_tl[l][m] = dist_tl[l][m].r + dist_tl[l][m].g + dist_tl[l][m].b;
total_dist_tm[l][m] = dist_tm[l][m].r + dist_tm[l][m].g + dist_tm[l][m].b;
total_dist_tr[l][m] = dist_tr[l][m].r + dist_tr[l][m].g + dist_tr[l][m].b;
total_dist_ml[l][m] = dist_ml[l][m].r + dist_ml[l][m].g + dist_ml[l][m].b;
total_dist_mm[l][m] = dist_mm[l][m].r + dist_mm[l][m].g + dist_mm[l][m].b;
total_dist_mr[l][m] = dist_mr[l][m].r + dist_mr[l][m].g + dist_mr[l][m].b;
total_dist_bl[l][m] = dist_bl[l][m].r + dist_bl[l][m].g + dist_bl[l][m].b;
total_dist_bm[l][m] = dist_bm[l][m].r + dist_bm[l][m].g + dist_bm[l][m].b;
total_dist_br[l][m] = dist_br[l][m].r + dist_br[l][m].g + dist_br[l][m].b;
}
}
for (int l = 0; l < 3; l++)
{
denoise_interpol_second_inner:for (int m = 0; m < 3; m++)
{
dy_l[l][m] = total_dist_tl[l][m] + total_dist_ml[l][m] + total_dist_bl[l][m];
dy_m[l][m] = total_dist_tm[l][m] + total_dist_mm[l][m] + total_dist_bm[l][m];
dy_r[l][m] = total_dist_tr[l][m] + total_dist_mr[l][m] + total_dist_br[l][m];
}
}
for (int l = 0; l < 3; l++)
{
denoise_interpol_third_inner:for (int m = 0; m < 3; m++)
{
dx[l][m] = dy_l[l][m] + dy_m[l][m] + dy_r[l][m];
//cout << endl<<"dx = " << dx[l][m];
weights[l][m] = exp(dx[l][m] * inv_sigma_sq);
}
}
nlms.r = 0;
nlms.g = 0;
nlms.b = 0;
nlms_alpha = 0;
for (int l = 0; l < 3; l++)
{
denoise_interpol_fourth_inner:for (int m = 0; m < 3; m++)
{
nlms.r = nlms.r + (weights[l][m] * alpha_red[l][m]);
nlms.g = nlms.g + (weights[l][m] * alpha_green[l][m]);
nlms.b = nlms.b + (weights[l][m] * alpha_blue[l][m]);
nlms_alpha = nlms_alpha + (weights[l][m] * 1.0);
}
}
nlm.r = min_def(max_def((nlms.r / nlms_alpha), 0.0f), 1.0f);
nlm.g = min_def(max_def((nlms.g / nlms_alpha), 0.0f), 1.0f);
nlm.b = min_def(max_def((nlms.b / nlms_alpha), 0.0f), 1.0f);
out_pix.r = nlm.r * 255;
out_pix.g = nlm.g * 255;
out_pix.b = nlm.b * 255;
return out_pix;
}
// Streaming top level: reads an image from INPUT_STREAM, runs
// denoise_interpol on a sliding 5x5 window and writes the results to
// OUTPUT_STREAM.
//
// INPUT_STREAM  : AXI video stream, unpacked into img_in.
// OUTPUT_STREAM : AXI video stream, packed from img_out.
// r, c          : number of rows / columns consumed from img_in.
//
// Each incoming 8-bit channel is converted to float and scaled by 1/256.
// Four previous rows are kept in line_buf; together with the current pixel
// they supply the rightmost column of the window, whose other columns shift
// left by one position per pixel.
//
// col indexes line_buf, whose second dimension is 516, so callers must keep
// c <= 516.
//
// NOTE(review): a pixel is pushed to img_out only when row > 3 && col > 3,
// i.e. (r - 4) * (c - 4) pixels per frame. Confirm this matches the frame
// size Mat2AXIvideo expects from RGB_IMAGE.
void denoise_filter(AXI_STREAM& INPUT_STREAM, AXI_STREAM& OUTPUT_STREAM, int r, int c)
{
#pragma HLS INTERFACE axis port=INPUT_STREAM
#pragma HLS INTERFACE axis port=OUTPUT_STREAM
#pragma HLS dataflow
    RGB_IMAGE_4 img_in;
    RGB_IMAGE img_out;

    // All 25 window entries are read in the same iteration, so they are kept
    // as individual registers.
    pixel window[5][5];
#pragma HLS ARRAY_PARTITION variable=window complete dim=0

    // Every iteration reads and writes all four rows at the same column;
    // splitting by row gives each row its own memory.
    pixel line_buf[4][516];
#pragma HLS ARRAY_PARTITION variable=line_buf complete dim=1

    // 1/256 is exactly representable, so multiplying by it gives the same
    // value as dividing by 256 while staying in single precision.
    const float inv_scale = 1.0f / 256.0f;

    hls::AXIvideo2Mat(INPUT_STREAM, img_in);

    r_uc2float:for (int row = 0; row < r; row++)
    {
#pragma HLS LOOP_TRIPCOUNT min=3 max=260 avg=260
        for (int col = 0; col < c; col++)
        {
#pragma HLS LOOP_TRIPCOUNT min=3 max=516 avg=516
#pragma HLS pipeline II=1
            RGB_PIXEL pin;
            img_in >> pin;

            pixel input_pix;
            input_pix.r = static_cast<float>(pin.val[0]) * inv_scale;
            input_pix.g = static_cast<float>(pin.val[1]) * inv_scale;
            input_pix.b = static_cast<float>(pin.val[2]) * inv_scale;

            // Shift the window one column to the left.
            for (int i = 0; i < 5; i++)
            {
                for (int j = 0; j < 4; j++)
                {
                    window[i][j] = window[i][j + 1];
                }
            }

            // New rightmost column: the four buffered rows above this pixel,
            // then the pixel itself. The buffered rows move up by one so the
            // oldest row is dropped.
            window[0][4] = line_buf[0][col];
            for (int k = 0; k < 3; k++)
            {
                line_buf[k][col] = line_buf[k + 1][col];
                window[k + 1][4] = line_buf[k][col];
            }
            line_buf[3][col] = input_pix;
            window[4][4] = input_pix;

            // The window holds five rows and five columns of real data only
            // from row 4 / col 4 onwards. The loop bounds already guarantee
            // row < r and col < c.
            if (row > 3 && col > 3)
            {
                const pixel_8 out_pix = denoise_interpol(window);
                RGB_PIXEL out;
                out.val[0] = out_pix.r;
                out.val[1] = out_pix.g;
                out.val[2] = out_pix.b;
                img_out << out;
            }
        }
    }

    hls::Mat2AXIvideo(img_out, OUTPUT_STREAM);
}
一旦达到所需的延迟,我将从浮点表示转向定点表示;如果用浮点表示无法实现,我也可以直接从定点表示开始。
非常感谢任何建议或想法,因为我目前有些卡住了,无法决定下一步该如何进行。
谢谢你