我有这样一个包含循环的函数:
/**
 * Computes a per-pixel response from the 3x3 neighbourhood of each interior
 * pixel of inputImage.
 *
 * For every pixel visited by the loops below, the value written is
 * (Lxx * Lyy - Lxy * Lxy) * norm2, i.e. the determinant of the symmetric 2x2
 * matrix [[Lxx, Lxy], [Lxy, Lyy]] built from finite differences, scaled by
 * norm2.
 *
 * NOTE(review): rows, cols, stride and norm2 are declared in code that is not
 * shown here (the `//...` markers). The comments below assume stride is the
 * row pitch in floats and that it equals cols (contiguous rows) — TODO confirm.
 *
 * @param inputImage Source image, accessed through float pointers
 *                   (presumably single-channel 32-bit float — TODO confirm).
 * @param norm       Not used in the visible code; norm2 is presumably derived
 *                   from it in the part that is not shown — TODO confirm.
 * @return The rows x cols CV_32FC1 matrix allocated below. The visible loops
 *         write only rows 1..rows-2 and columns 1..cols-2; border pixels are
 *         not written by the code shown here.
 */
Mat HessianDetector::hessianResponse(const Mat &inputImage, float norm)
{
//...
// `in` starts at the beginning of input row 1.
const float *in = inputImage.ptr<float>(1);
Mat outputImage(rows, cols, CV_32FC1);
// `out` starts one element into output row 1, i.e. at the first interior pixel.
float *out = outputImage.ptr<float>(1) + 1;
//...
for (int r = 1; r < rows - 1; ++r)
{
// Prime the 3x3 window with its two left-most columns: the elements at `in`
// and `in + 1`, each taken one stride above, at, and one stride below `in`.
// The third column is loaded inside the inner loop.
float v11, v12, v21, v22, v31, v32;
v11 = in[-stride]; v12 = in[1 - stride];
v21 = in[ 0]; v22 = in[1 ];
v31 = in[+stride]; v32 = in[1 + stride];
// Step over the two primed columns so `in` addresses the window's third column.
in += 2;
// Each iteration produces one output pixel. `in` and `out` advance by one
// element per iteration in the loop header; `in` stays one column ahead of
// the pixel being written.
// NOTE(review): the window variables v11..v32 carry values from one
// iteration to the next (see "move window" below).
for (int c = 1; c < cols - 1; ++c, in++, out++)
{
/* fetch remaining values (last column) */
const float v13 = in[-stride];
const float v23 = *in;
const float v33 = in[+stride];
// compute 3x3 Hessian values from symmetric differences.
// Lxx / Lyy: second differences through the window centre v22 along the
// row and along the column; Lxy: difference of the four corners divided by 4.
float Lxx = (v21 - 2*v22 + v23);
float Lyy = (v12 - 2*v22 + v32);
float Lxy = (v13 - v11 + v31 - v33)/4.0f;
/* normalize and write out */
*out = (Lxx * Lyy - Lxy * Lxy)*norm2;
/* move window: shift the window one column to the right */
v11=v12; v12=v13;
v21=v22; v22=v23;
v31=v32; v32=v33;
/* input/output pointers are advanced by the loop header (in++, out++) */
}
// `out` has advanced cols - 2 elements in this row; skipping 2 more gives a
// total advance of cols per row, which puts it one element into the next row
// if output rows are contiguous — TODO confirm.
// `in` has likewise advanced 2 + (cols - 2) = cols elements, which is the
// start of the next row only if stride == cols — TODO confirm.
out += 2;
}
return outputImage;
}
该函数的调用方式如下:
// Fills hessResps for every (level i, scale j) pair: the response at index
// j + scaleCycles * i is computed from the blurs entry at the same index.
// With j in 1..scaleCycles, each (i, j) pair maps to a distinct index, so no
// two iterations write the same hessResps slot; index 0 is not written here.
// The two nested loops are collapsed into a single iteration space and
// distributed with dynamic scheduling.
// NOTE(review): `omp for` is a worksharing construct; presumably this sits
// inside an enclosing `#pragma omp parallel` region that is not shown — TODO confirm.
#pragma omp for collapse(2) schedule(dynamic)
for(int i=0; i<levels; i++)
for (int j = 1; j <= scaleCycles; j++)
{
// Offset of level i within the flat blurs / hessResps arrays.
int scaleCyclesLevel = scaleCycles * i;
float curSigma = par.sigmas[j];
// The squared sigma is passed as hessianResponse's `norm` argument.
hessResps[j+scaleCyclesLevel] = hessianResponse(blurs[j+scaleCyclesLevel], curSigma*curSigma);
}
具体来说,Intel Advisor 指出内层循环非常耗时,应该进行矢量化:
for (int c = 1; c < cols - 1; ++c, in++, out++)
但是,它还说在这两行之间存在写后读(Read After Write,RAW)依赖:
读:
float Lyy = (v12 - 2*v22 + v32);
写:
hessResps[j+scaleCyclesLevel] = hessianResponse(blurs[j+scaleCyclesLevel], curSigma*curSigma);
但我真的不明白为什么会发生这种情况(即使我知道 RAW 依赖的含义)。
这是优化报告:
LOOP BEGIN at /home/luca/Dropbox/HKUST/CloudCache/cloudcache/CloudCache/Descriptors/hesaff/pyramid.cpp(92,7)
remark #17104: loop was not parallelized: existence of parallel dependence
remark #17106: parallel dependence: assumed ANTI dependence between *(in+cols*4) (95:28) and *out (105:11)
remark #17106: parallel dependence: assumed FLOW dependence between *out (105:11) and *(in+cols*4) (95:28)
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed ANTI dependence between *(in+cols*4) (95:28) and *out (105:11)
remark #15346: vector dependence: assumed FLOW dependence between *out (105:11) and *(in+cols*4) (95:28)
LOOP END
第 95 行是:
const float v13 = in[-stride];
第 105 行是:
*out = (Lxx * Lyy - Lxy * Lxy)*norm2;