performance - 如何优化这个 OpenCL Sobel 过滤器内核？

Question

我为 3x3 Sobel 滤波器编写了一个 OpenCL 内核，目前它在 2k x 2k 图像上以大约 17 毫秒的速度运行。这并不像我希望的那么快。有人对如何提高速度有任何建议吗？我已经遵循了优化内核清单上的大部分建议。我的处理器是 Intel i5-3450。工作组大小为 8x8，工作项的数量为高 x 宽 / 16，在我运行的图像上为 2048 x 128。

__kernel void localCacheSobelFilter(
  const __global char16* src, 
  __write_only __global float16* angle, 
  __write_only __global float16* mag, 
  const int width, 
  const int height)
{
  // Cache the data we're looking at in __local space
  const int row = get_global_id(0);
  const int col = get_global_id(1);

  const int cacheRow = get_local_id(0) + 1;
  const int cacheCol = get_local_id(1) + 1;

  __local char16 cache[BLOCK_SIZE + 2][BLOCK_SIZE + 2];


  cache[cacheRow][cacheCol] = src[ indexOf(row, col) ];

  // --- Deal with the boundary conditions
  // This adds in the rows above and below the local block,
  // ignoring the corners.
  const bool atTopRow = (cacheRow == 1);
  const bool atBottomRow = (cacheRow == BLOCK_SIZE);

  if(atTopRow) {
    cache[0][cacheCol] = src[ indexOf(row - 1, col) ];

  } else if (atBottomRow) {
    cache[BLOCK_SIZE + 1][cacheCol] = src[ indexOf(row + 1, col) ];
  }


  // This adds in the columns to the left and right of the local block,
  // ignoring the corners.
  const bool atLeftCol = (cacheCol == 1);
  const bool atRightCol = (cacheCol == BLOCK_SIZE);

  if(atLeftCol) { 
    cache[cacheRow][0].sf = src[ indexOf(row, col - 1) ].sf;

  } else if (atRightCol) {
    cache[cacheRow][BLOCK_SIZE + 1].s0 = src[ indexOf(row, col + 1) ].s0;
  }


  // Now finally check the corners
  const bool atTLCorner = atTopRow && atLeftCol;
  const bool atTRCorner = atTopRow && atRightCol;
  const bool atBLCorner = atBottomRow && atLeftCol;
  const bool atBRCorner = atBottomRow && atRightCol;


  if(atTLCorner) {
    cache[0][0].sf = src[ indexOf(row - 1, col - 1) ].sf;

  } else if (atTRCorner) { 
    cache[0][BLOCK_SIZE + 1].s0 = src[ indexOf(row - 1, col + 1) ].s0;

  } else if (atBLCorner) {
    cache[BLOCK_SIZE + 1][0].sf = src[ indexOf(row + 1, col - 1) ].sf;

  } else if (atBRCorner) {
    cache[BLOCK_SIZE + 1][BLOCK_SIZE + 1].s0 = src[ indexOf(row + 1, col + 1) ].s0;
  }

  barrier(CLK_LOCAL_MEM_FENCE); 

  //===========================================================================
  // Do the calculation

  //  [..., pix00]  upperRow  [pix02, ...]
  //  [..., pix10]  centerRow [pix12, ...]
  //  [..., pix20]  lowerRow  [pix22, ...]
  const char pix00 = cache[cacheRow - 1][cacheCol - 1].sf;
  const char pix10 = cache[cacheRow    ][cacheCol - 1].sf;
  const char pix20 = cache[cacheRow + 1][cacheCol - 1].sf;

  const char16 upperRow  = cache[cacheRow - 1][cacheCol];    
  const char16 centerRow = cache[cacheRow    ][cacheCol];
  const char16 lowerRow  = cache[cacheRow + 1][cacheCol];

  const char pix02 = cache[cacheRow - 1][cacheCol + 1].s0;
  const char pix12 = cache[cacheRow    ][cacheCol + 1].s0;              
  const char pix22 = cache[cacheRow + 1][cacheCol + 1].s0;



  // Do the calculations for Gy
  const char16 upperRowShiftLeft = (char16)(upperRow.s123456789abcdef, pix02);
  const char16 upperRowShiftRight = (char16)(pix00, upperRow.s0123456789abcde);

  const char16 lowerRowShiftLeft = (char16)(lowerRow.s123456789abcdef, pix22);
  const char16 lowerRowShiftRight = (char16)(pix20, lowerRow.s0123456789abcde);

  const float16 Gy = convert_float16(
    (upperRowShiftLeft + 2 * upperRow + upperRowShiftRight)
    - (lowerRowShiftLeft + 2 * lowerRow + lowerRowShiftRight));


  // Do the calculations for Gx
  const char16 centerRowShiftLeft = (char16)(centerRow.s123456789abcdef, pix12);
  const char16 centerRowShiftRight = (char16)(pix10, centerRow.s0123456789abcde);

  const float16 Gx = convert_float16(
    (upperRowShiftRight + 2 * centerRowShiftRight + lowerRowShiftRight)
    - (upperRowShiftLeft + 2 * centerRowShiftLeft + lowerRowShiftLeft));


  // Find the angle and magnitude
  angle[ indexOf(row, col) ] = 0.0; //atan2(Gy, Gx);
  mag[ indexOf(row, col) ] = ALPHA * max(Gx, Gy) + BETA * min(Gx, Gy);
}

任何帮助将不胜感激。谢谢！

score 2 · Accepted Answer

在您的内核中，您有很多“如果”来避免边缘效应。但是对于 99% 的时间来说，这是耗时且无用的。我认为您可以更改 NDRangeKernel 中的全局工作大小并使用偏移量来避免边缘效应。

例如：偏移 = {1,1,0} GlobalWorkSize = { width-2, height-2,0}

performance - 如何优化这个 OpenCL Sobel 过滤器内核？

1 回答 1

Related

Reference