下面的函数包含三层嵌套的 for 循环。为了便于理解,我在后面给出了完整的函数。我想先并行化最内层的 for 循环,因为它占用的 CPU 时间最多;之后再考虑外面的两层 for 循环。我发现最内层 for 循环中存在数据依赖,并且调用了内联函数。能否重写最内层的 for 循环,以便使用 OpenMP pragma 将其并行化?如果可以,请告诉我该怎么做。下面先单独列出我关心的那个循环,随后给出包含该循环的完整函数以供参考。
我想并行化的就是下面这个循环。
/* Row loop the author wants to parallelize.
 * Each pass fetches one reference row through PelYline_11 and adds
 * byte_abs[ref - org] for 16 consecutive samples: samples 0-3 go into
 * LineSadBlk0, 4-7 into LineSadBlk1, 8-11 into LineSadBlk2 and 12-15 into
 * LineSadBlk3.  abs_y, refptr and orgptr are all advanced as a side effect,
 * so every pass depends on the state left behind by the previous one. */
for (y = 0; y < 4; y++)
{
  int px;   /* counts the four samples that feed one accumulator */

  /* Row at the current abs_y; abs_y is stepped to the next row afterwards. */
  refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

  for (px = 0; px < 4; px++)
    LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

  for (px = 0; px < 4; px++)
    LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

  for (px = 0; px < 4; px++)
    LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

  for (px = 0; px < 4; px++)
    LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
}
包含该循环的完整函数如下,供参考。
/*!
***********************************************************************
* \brief
* Setup the fast search for an macroblock
***********************************************************************
*/
void SetupFastFullPelSearch (short ref, int list) // <-- reference frame parameter, list0 or 1
{
short pmv[2];
pel_t orig_blocks[256], *orgptr=orig_blocks, *refptr, *tem; // created pointer tem
int offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
int LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
int max_width, max_height;
int img_width, img_height;
StorablePicture *ref_picture;
pel_t *ref_pic;
int** block_sad = BlockSAD[list][ref][7];
int search_range = max_search_range[list][ref];
int max_pos = (2*search_range+1) * (2*search_range+1);
int list_offset = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;
int apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
(active_pps->weighted_bipred_idc && (img->type == B_SLICE)));
ref_picture = listX[list+list_offset][ref];
//===== Use weighted Reference for ME ====
if (apply_weights && input->UseWeightedReferenceME)
ref_pic = ref_picture->imgY_11_w;
else
ref_pic = ref_picture->imgY_11;
max_width = ref_picture->size_x - 17;
max_height = ref_picture->size_y - 17;
img_width = ref_picture->size_x;
img_height = ref_picture->size_y;
//===== get search center: predictor of 16x16 block =====
SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);
search_center_x[list][ref] = pmv[0] / 4;
search_center_y[list][ref] = pmv[1] / 4;
if (!input->rdopt)
{
//--- correct center so that (0,0) vector is inside ---
search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));
search_center_y[list][ref] = max(-search_range, min(search_range, search_center_y[list][ref]));
}
search_center_x[list][ref] += img->opix_x;
search_center_y[list][ref] += img->opix_y;
offset_x = search_center_x[list][ref];
offset_y = search_center_y[list][ref];
//===== copy original block for fast access =====
for (y = img->opix_y; y < img->opix_y+16; y++)
for (x = img->opix_x; x < img->opix_x+16; x++)
*orgptr++ = imgY_org [y][x];
//===== check if whole search range is inside image =====
if (offset_x >= search_range && offset_x <= max_width - search_range &&
offset_y >= search_range && offset_y <= max_height - search_range )
{
range_partly_outside = 0; PelYline_11 = FastLine16Y_11;
}
else
{
range_partly_outside = 1;
}
//===== determine position of (0,0)-vector =====
if (!input->rdopt)
{
ref_x = img->opix_x - offset_x;
ref_y = img->opix_y - offset_y;
for (pos = 0; pos < max_pos; pos++)
{
if (ref_x == spiral_search_x[pos] &&
ref_y == spiral_search_y[pos])
{
pos_00[list][ref] = pos;
break;
}
}
}
//===== loop over search range (spiral search): get blockwise SAD =====
**// =====THIS IS THE PART WHERE NESTED FOR STARTS=====**
for (pos = 0; pos < max_pos; pos++) // OUTERMOST FOR LOOP
{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];
if (range_partly_outside)
{
if (abs_y >= 0 && abs_y <= max_height && abs_x >= 0 && abs_x <= max_width )
{
PelYline_11 = FastLine16Y_11;
}
else
{
PelYline_11 = UMVLine16Y_11;
}
}
orgptr = orig_blocks;
bindex = 0;
for (blky = 0; blky < 4; blky++) // SECOND FOR LOOP
{
LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;
for (y = 0; y < 4; y++) //INNERMOST FOR LOOP WHICH I WANT TO PARALLELIZE
{
refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
}
block_sad[bindex++][pos] = LineSadBlk0;
block_sad[bindex++][pos] = LineSadBlk1;
block_sad[bindex++][pos] = LineSadBlk2;
block_sad[bindex++][pos] = LineSadBlk3;
}
}
//===== combine SAD's for larger block types =====
SetupLargerBlocks (list, ref, max_pos);
//===== set flag marking that search setup have been done =====
search_setup_done[list][ref] = 1;
}
#endif // _FAST_FULL_ME_
我重写了代码,尝试消除最内层 for 循环(即 for (y = 0; y < 4; y++))中的依赖关系,其中涉及多个 LineSadBlk 变量。如有错误请指出。我认为按这种方式重排之后,refptr 和 orgptr 的依赖已经解决;但 LineSadBlk0、1、2、3 仍然存在依赖:如果并行执行第一次和第二次迭代,多个线程会同时读写 LineSadBlk0、1、2、3 的值。请问这个问题该如何解决?
/*!
***********************************************************************
* \brief
* Setup the fast search for an macroblock
***********************************************************************
*/
void SetupFastFullPelSearch (short ref, int list) // <-- reference frame parameter, list0 or 1
{
short pmv[2];
pel_t orig_blocks[256];
//pel_t *orgptr, *refptr[4];
pel_t *orgptr[4],*refptr[4]; //defined by me new
int offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
int LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
int max_width, max_height;
int img_width, img_height;
StorablePicture *ref_picture;
pel_t *ref_pic;
int** block_sad = BlockSAD[list][ref][7];
int search_range = max_search_range[list][ref];
int max_pos = (2*search_range+1) * (2*search_range+1);
int list_offset = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;
int apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
(active_pps->weighted_bipred_idc && (img->type == B_SLICE)));
ref_picture = listX[list+list_offset][ref];
//===== Use weighted Reference for ME ====
if (apply_weights && input->UseWeightedReferenceME)
ref_pic = ref_picture->imgY_11_w;
else
ref_pic = ref_picture->imgY_11;
max_width = ref_picture->size_x - 17;
max_height = ref_picture->size_y - 17;
img_width = ref_picture->size_x;
img_height = ref_picture->size_y;
//===== get search center: predictor of 16x16 block =====
SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16); //call 1
search_center_x[list][ref] = pmv[0] / 4;
search_center_y[list][ref] = pmv[1] / 4;
if (!input->rdopt)
{
//--- correct center so that (0,0) vector is inside ---
search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));
search_center_y[list][ref] = max(-search_range, min(search_range, search_center_y[list][ref]));
}
search_center_x[list][ref] += img->opix_x;
search_center_y[list][ref] += img->opix_y;
offset_x = search_center_x[list][ref];
offset_y = search_center_y[list][ref];
// orgptr=orig_blocks;
orgptr[0]= orig_blocks //all org pointers defined orig blocks
orgptr[1]= orig_blocks;
orgptr[2]= orig_blocks;
orgptr[3]= orig_blocks;
//===== copy original block for fast access =====
for (y = img->opix_y; y < img->opix_y+16; y++)
for (x = img->opix_x; x < img->opix_x+16; x++)
{
//*orgptr++ = imgY_org [y][x];
*(orgptr[0])++ = imgY_org [y][x]; // img stored in all orgptr
*(orgptr[1])++ = imgY_org [y][x];
*(orgptr[2])++ = imgY_org [y][x];
*(orgptr[3])++ = imgY_org [y][x];
}
//===== check if whole search range is inside image =====
if (offset_x >= search_range && offset_x <= max_width - search_range &&
offset_y >= search_range && offset_y <= max_height - search_range )
{
range_partly_outside = 0; PelYline_11 = FastLine16Y_11; //search range is fully inside image
}
else
{
range_partly_outside //search range is partly outside image
}
//===== determine position of (0,0)-vector =====
if (!input->rdopt)
{
ref_x = img->opix_x - offset_x;
ref_y = img->opix_y - offset_y;
for (pos = 0; pos < max_pos; pos++)
{
if (ref_x == spiral_search_x[pos] &&
ref_y == spiral_search_y[pos])
{
pos_00[list][ref] = pos;
break;
}
}
}
//===== loop over search range (spiral search): get blockwise SAD =====
for (pos = 0; pos < max_pos; pos++)
{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];
if (range_partly_outside)
{
if (abs_y >= 0 && abs_y <= max_height &&
abs_x >= 0 && abs_x <= max_width )
{
PelYline_11 = FastLine16Y_11; //call 2
}
else
{
PelYline_11 = UMVLine16Y_11; //call 3
}
}
//orgptr=orig_blocks;
orgptr[0]=orig_blocks;
orgptr[1]=orgptr[0]+16;
orgptr[2]=orgptr[1]+16;
orgptr[3]=orgptr[2]+16;
bindex = 0;
for (blky = 0; blky < 4; blky++)
{
LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;
// i added the following to take refptr out of loop
refptr[0] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width); //call either 2 or 3
abs_y++;
refptr[1] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width); //call either 2 or 3
abs_y++;
refptr[2] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width); //call either 2 or 3
abs_y++;
refptr[3] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width); //call either 2 or 3
abs_y++;
omp_set_num_threads(4);
#pragma omp parallel for reduction(+:LineSadBlk0,LineSadBlk1,LineSadBlk2,LineSadBlk3)
for (y = 0; y < 4; y++)
{
{
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
{
LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
{
LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
{
LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
}
}
block_sad[bindex++][pos] = LineSadBlk0;
block_sad[bindex++][pos] = LineSadBlk1;
block_sad[bindex++][pos] = LineSadBlk2;
block_sad[bindex++][pos] = LineSadBlk3;
}
}
//===== combine SAD's for larger block types =====
SetupLargerBlocks (list, ref, max_pos); //call4
//===== set flag marking that search setup have been done =====
search_setup_done[list][ref] = 1;
}
#endif // _FAST_FULL_ME_