c++ - 需要帮助优化代码（最小图像约定）

Question

我已经编写了一些模拟代码，并且正在使用“在 GDB 中随机中断”的调试方法。我发现我的程序 99.9% 的时间都花在了这个例程中（这是最小的图像约定）：

inline double distanceSqPeriodic(double const * const position1, double const * const position2, double boxWidth) {
        double xhw, yhw, zhw, x, y, z;                                                                                 

        xhw = boxWidth / 2.0;                                                                                      
        yhw = xhw;                                                                                      
        zhw = xhw;                                                                                      

        x = position2[0] - position1[0];                                                                               
        if (x > xhw) 
                x -= boxWidth;                                                                                     
        else if (x < -xhw)
                x += boxWidth;                                                                                     


        y = position2[1] - position1[1];                                                                               
        if (y > yhw) 
                y -= boxWidth;                                                                                     
        else if (y < -yhw)
                y += boxWidth;                                                                                     


        z = position2[2] - position1[2];                                                                               
        if (z > zhw) 
                z -= boxWidth;                                                                                     
        else if (z < -zhw)
                z += boxWidth;                                                                                     

        return x * x + y * y + z * z;                                                                                  
}

到目前为止我执行的优化（可能不是很重要）：

返回距离的平方而不是平方根
内联它
尽我所能
没有标准库膨胀
使用我能想到的每个 g++ 优化标志进行编译

我已经没有什么可以做的了。也许我可以使用浮点数而不是双精度数，但我希望这是最后的手段。也许我可以以某种方式使用 SIMD，但我从来没有这样做过，所以我想这是很多工作。有任何想法吗？

谢谢

score 2 · Accepted Answer

首先，您没有使用正确的算法。如果两个点之间的距离大于 boxWidth 怎么办？其次，如果您有多个粒子，调用一个函数来执行所有距离计算并将结果放在输出缓冲区中会显着提高效率。内联有助于减少其中的一些，但不是全部。任何预先计算——比如在你的算法中将盒子长度除以 2——都会在不需要的时候重复。

这是一些用于计算的 SIMD 代码。您需要使用 -msse4 进行编译。在我的机器（macbook pro，llvm-gcc-4.2）上使用 -O3，我的速度提高了大约 2 倍。这确实需要使用 32 位浮点数而不是双精度算术。

SSE 真的没那么复杂，只是看起来很糟糕。例如，您必须编写笨拙的_mm_mul_ps(a,b)，而不是编写a*b。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <smmintrin.h>

// you can compile this code with -DDOUBLE to try using doubles vs. floats
// in the unoptimized code. The SSE code uses only floats.
#ifdef DOUBLE
typedef double real;
#else
typedef float real;
#endif

static inline __m128 loadFloat3(const float const* value) {
  // Load (x,y,z) into a SSE register, leaving the last entry
  // set to zero.
  __m128 x = _mm_load_ss(&value[0]);
  __m128 y = _mm_load_ss(&value[1]);
  __m128 z = _mm_load_ss(&value[2]);
  __m128 xy = _mm_movelh_ps(x, y);
  return _mm_shuffle_ps(xy, z, _MM_SHUFFLE(2, 0, 2, 0));
}

int fdistanceSqPeriodic(float* position1, float* position2, const float boxWidth,
                          float* out, const int n_points) {
  int i;
  __m128 r1, r2, r12, s12, r12_2, s, box, invBox;

  box = _mm_set1_ps(boxWidth);
  invBox = _mm_div_ps(_mm_set1_ps(1.0f), box);

  for (i = 0; i < n_points; i++) {
    r1 = loadFloat3(position1);
    r2 = loadFloat3(position1);
    r12 = _mm_sub_ps(r1, r2);

    s12 = _mm_mul_ps(r12, invBox);
    s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT));
    r12 = _mm_mul_ps(box, s12);

    r12_2 = _mm_mul_ps(r12, r12);

    // double horizontal add instruction accumulates the sum of
    // all four elements into each of the elements
    // (e.g. s.x = s.y = s.z = s.w =  r12_2.x + r12_2.y + r12_2.z + r12_2.w)
    s = _mm_hadd_ps(r12_2, r12_2);
    s = _mm_hadd_ps(s, s);

    _mm_store_ss(out++, s);
    position1 += 3;
    position2 += 3;
  }
  return 1;
}

inline real distanceSqPeriodic(real const * const position1, real const * const position2, real boxWidth) {
  real xhw, yhw, zhw, x, y, z;                                                                                 

  xhw = boxWidth / 2.0;                                                                                      
  yhw = xhw;                                                                                      
  zhw = xhw;                                                                                      

  x = position2[0] - position1[0];                                                                               
  if (x > xhw) 
    x -= boxWidth;                                                                                     
  else if (x < -xhw)
    x += boxWidth;                                                                                     


  y = position2[1] - position1[1];                                                                               
  if (y > yhw) 
    y -= boxWidth;                                                                                     
  else if (y < -yhw)
    y += boxWidth;                                                                                     


  z = position2[2] - position1[2];                                                                               
  if (z > zhw) 
    z -= boxWidth;                                                                                     
  else if (z < -zhw)
    z += boxWidth;                                                                                     

  return x * x + y * y + z * z;                                                                                  
}


int main(void) {
  real* position1;
  real* position2;
  real* output;
  int n_runs = 10000000;
  posix_memalign((void**) &position1, 16, n_runs*3*sizeof(real));
  posix_memalign((void**) &position2, 16, n_runs*3*sizeof(real));
  posix_memalign((void**) &output, 16, n_runs*sizeof(real));
  real boxWidth = 1.8;
  real result = 0;
  int i;
  clock_t t;

#ifdef OPT
  printf("Timing optimized SSE implementation\n");
#else
  printf("Timinig original implementation\n");
#endif

#ifdef DOUBLE
  printf("Using double precision\n");
#else
  printf("Using single precision\n");
#endif

  t = clock();
#ifdef OPT
  fdistanceSqPeriodic(position1, position2, boxWidth, output, n_runs);
#else
  for (i = 0; i < n_runs; i++) {
    *output = distanceSqPeriodic(position1, position2,  boxWidth);
    position1 += 3;
    position2 += 3;
    output++;
  }
#endif
  t = clock() - t;

  printf("It took me %d clicks (%f seconds).\n", (int) t, ((float)t)/CLOCKS_PER_SEC);
}

score 1 · Accepted Answer

您可能希望使用 fabs（在 ISO 90 C 中标准化），因为这应该能够简化为单个非分支指令。

score 0 · Accepted Answer

Return the square of the distance instead of the square root

只要您将正方形与正方形进行比较，这是一个好主意。

Inline it

这有时是一种反优化：内联代码占用执行管道/缓存中的空间，无论它是否分支。

通常它没有区别，因为编译器对是否内联有最终决定权。

Const what I can

通常没有任何区别。

No standard library bloat

什么肿？

Compiling with every g++ optimization flag I can think of

这很好：将大部分优化留给编译器。仅当您测量了真正的瓶颈并确定该瓶颈是否严重时，才可以在手头上投入资金进行优化。

您可以尝试做的是使您的代码无分支。如果不使用位掩码，这可能如下所示：

//if (z > zhw) 
//        z -= boxWidths[2];
//else if (z < -zhw)
//      z += boxWidths[2]; 

const auto z_a[] = {
    z,
    z - boxWidths[2]
};
z = z_a[z>zhw];
...

或者

z -= (z>zhw) * boxWidths[2];

但是，不能保证这会更快。您的编译器现在可能更难识别代码中的 SIMD 点，或者分支目标缓冲区做得很好，并且大多数时候您的函数具有相同的代码路径。

score 0 · Accepted Answer

您需要摆脱比较，因为这些比较难以预测。

要实现的功能是：

     /    /           /\  /\
    /    /           /  \/  \
    ----0-----  or ------------ , as (-x)^2 == x^2
       /    / 
      /    /

后者是两个 abs 语句的结果：

 x = abs(half-abs(diff))+half;

编码

double tst(double a[4], double b[4], double half)
{
   double sum=0.0,t;
   int i;
   for (i=0;i<3;i++) { t=fabs(fabs(b[i]-a[i])-half)-half; sum+=t*t;}
   return sum;
}

比原来的实现高出四倍（+一些）——此时甚至没有完全并行性：只使用了 xmm 寄存器的下半部分。

通过 x && y 的并行处理，可以实现大约 50% 的理论增益。从理论上讲，使用浮点数而不是双精度数可以使其速度提高约 3 倍。

c++ - 需要帮助优化代码（最小图像约定）

4 回答 4

Related

Reference