0

我正在开发一个需要自动矢量化大循环的项目。必须使用 GCC 编译。问题的最小情况可能如下:

#define VLEN 4
#define NTHREADS 4
#define AVX512_ALIGNMENT 64

#define NUM_INTERNAL_ITERS 5

#define real double

typedef struct private_data {
    /*
     * Alloc enough space for private data and MEM_BLOCK_SIZE bytes of padding.
     * Private data must be allocated all at once to squeeze cache performance by only
     * padding once per CPU.
     */
    real *contiguous_data;

    /*
     * Pointers to corresponding index in contiguous_data.
     */
    real *array_1;
    real *array_2;
} private_data_t;

private_data_t private_data[NTHREADS];

int num_iter;

void minimum_case(const int thread) {

    // Reference to thread private data.

    real *restrict array_1 =
        __builtin_assume_aligned(private_data[thread].array_1, AVX512_ALIGNMENT);

    real *restrict array_2 =
        __builtin_assume_aligned(private_data[thread].array_2, AVX512_ALIGNMENT);

    for (int i = 0; i < num_iter; i++) {
        for (int k = 0; k < NUM_INTERNAL_ITERS; ++k) {

            int array_1_entry =
                (k * (NUM_INTERNAL_ITERS) * VLEN) +
                i * NUM_INTERNAL_ITERS * NUM_INTERNAL_ITERS * VLEN;

            int array_2_entry =
                (k * (NUM_INTERNAL_ITERS) * VLEN) +
                i * NUM_INTERNAL_ITERS * VLEN;

#pragma GCC unroll 1
#pragma GCC ivdep
            for (int j = 0; j < VLEN; j++) {

                real pivot;

                int a_idx = array_1_entry + VLEN * 0 + j;
                int b_idx = array_1_entry + VLEN * 1 + j;
                int c_idx = array_1_entry + VLEN * 2 + j;
                int d_idx = array_1_entry + VLEN * 3 + j;

                int S_idx = array_2_entry + VLEN * 0 + j;

                if (k == 0) {

                    pivot = array_1[a_idx];

                    // b = b / a
                    array_1[b_idx] /= pivot;

                    // c = c / a
                    array_1[c_idx] /= pivot;

                    // d = d / a
                    array_1[d_idx] /= pivot;

                    // S = S / a
                    array_2[S_idx] /= pivot;
                }

                int e_idx = array_1_entry + VLEN * 4 + j;
                int f_idx = array_1_entry + VLEN * 5 + j;
                int g_idx = array_1_entry + VLEN * 6 + j;
                int k_idx = array_1_entry + VLEN * 7 + j;

                int T_idx = array_2_entry + VLEN * 1 + j;

                pivot = array_1[e_idx];

                // f = f - (e * b)
                array_1[f_idx] -= array_1[b_idx]
                                  * pivot;

                // g = g - (e * c)
                array_1[g_idx] -= array_1[c_idx]
                                  * pivot;

                // k = k - (e * d)
                array_1[k_idx] -= array_1[d_idx]
                                  * pivot;

                // T = T - (e * S)
                array_2[T_idx] -= array_2[S_idx]
                                  * pivot;
            }
        }
    }
}

对于这种特定情况,GCC 使用 16B 向量而不是 32B 向量进行自动向量化。很容易看出控制流依赖于一个可以从内部循环中检查出来的条件,但是 GCC 没有执行任何循环取消切换。

循环取消切换可以手动完成,但请注意,这是问题的最小情况,真正的循环有数百行,执行手动取消循环会导致大量代码冗余。我试图找到一种方法来强制 GCC 为可以从内部循环中检查出来的不同条件创建不同的循环。

目前我正在使用带有以下标志的 GCC 9.2:-Ofast -march=native -std=c11 -fopenmp -ftree-vectorize -ffast-math -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -fopt-info-vec-optimized

4

1 回答 1

0

特别是如果真正的循环有数百行,我强烈建议将其分解为一个单独的函数——这将使手动取消切换(在必要时)不会那么糟糕。

以下应该等同于您的代码(注意,我还考虑了一些索引计算 - 这实际上可以进一步简化):

inline void inner_loop(real *restrict array_1, real *restrict array_2,
                       int const first) {
#pragma GCC unroll 1
  for (int j = 0; j < VLEN; j++) {
    int a_idx = VLEN * 0 + j;
    int b_idx = VLEN * 1 + j;
    int c_idx = VLEN * 2 + j;
    int d_idx = VLEN * 3 + j;

    int S_idx = VLEN * 0 + j;

    if (first) {
      real pivot = array_1[a_idx];

      array_1[b_idx] /= pivot;      // b = b / a
      array_1[c_idx] /= pivot;      // c = c / a
      array_1[d_idx] /= pivot;      // d = d / a
      array_2[S_idx] /= pivot;      // S = S / a
    }

    int e_idx = VLEN * 4 + j;
    int f_idx = VLEN * 5 + j;
    int g_idx = VLEN * 6 + j;
    int k_idx = VLEN * 7 + j;

    int T_idx = VLEN * 1 + j;

    real pivot = array_1[e_idx];

    array_1[f_idx] -= array_1[b_idx] * pivot;    // f = f - (e * b)
    array_1[g_idx] -= array_1[c_idx] * pivot;    // g = g - (e * c)
    array_1[k_idx] -= array_1[d_idx] * pivot;    // k = k - (e * d)
    array_2[T_idx] -= array_2[S_idx] * pivot;    // T = T - (e * S)
  }
}

void minimum_case(const int thread) {
  // Reference to thread private data.

  real *restrict array_1 =
      __builtin_assume_aligned(private_data[thread].array_1, AVX512_ALIGNMENT);

  real *restrict array_2 =
      __builtin_assume_aligned(private_data[thread].array_2, AVX512_ALIGNMENT);

  for (int i = 0; i < num_iter; i++) {
    real *array_1_i =
        array_1 + i * NUM_INTERNAL_ITERS * NUM_INTERNAL_ITERS * VLEN;
    real *array_2_i = array_2 + i * NUM_INTERNAL_ITERS * VLEN;

    inner_loop(array_1_i, array_2_i, 0);
    for (int k = 1; k < NUM_INTERNAL_ITERS; ++k) {
      int array_1_entry = (k * (NUM_INTERNAL_ITERS)*VLEN);

      int array_2_entry = (k * (NUM_INTERNAL_ITERS)*VLEN);
      inner_loop(array_1_i + array_1_entry, array_2_i + array_2_entry, 1);
    }
  }
}

Godbolt 的完整演示:https ://godbolt.org/z/wMgSnr

于 2020-04-12T00:36:40.183 回答