我正在开发一个需要自动矢量化大循环的项目。必须使用 GCC 编译。问题的最小情况可能如下:
#define VLEN 4
#define NTHREADS 4
#define AVX512_ALIGNMENT 64
#define NUM_INTERNAL_ITERS 5
#define real double
typedef struct private_data {
/*
* Alloc enough space for private data and MEM_BLOCK_SIZE bytes of padding.
* Private data must be allocated all at once to squeeze cache performance by only
* padding once per CPU.
*/
real *contiguous_data;
/*
* Pointers to corresponding index in contiguous_data.
*/
real *array_1;
real *array_2;
} private_data_t;
private_data_t private_data[NTHREADS];
int num_iter;
void minimum_case(const int thread) {
// Reference to thread private data.
real *restrict array_1 =
__builtin_assume_aligned(private_data[thread].array_1, AVX512_ALIGNMENT);
real *restrict array_2 =
__builtin_assume_aligned(private_data[thread].array_2, AVX512_ALIGNMENT);
for (int i = 0; i < num_iter; i++) {
for (int k = 0; k < NUM_INTERNAL_ITERS; ++k) {
int array_1_entry =
(k * (NUM_INTERNAL_ITERS) * VLEN) +
i * NUM_INTERNAL_ITERS * NUM_INTERNAL_ITERS * VLEN;
int array_2_entry =
(k * (NUM_INTERNAL_ITERS) * VLEN) +
i * NUM_INTERNAL_ITERS * VLEN;
#pragma GCC unroll 1
#pragma GCC ivdep
for (int j = 0; j < VLEN; j++) {
real pivot;
int a_idx = array_1_entry + VLEN * 0 + j;
int b_idx = array_1_entry + VLEN * 1 + j;
int c_idx = array_1_entry + VLEN * 2 + j;
int d_idx = array_1_entry + VLEN * 3 + j;
int S_idx = array_2_entry + VLEN * 0 + j;
if (k == 0) {
pivot = array_1[a_idx];
// b = b / a
array_1[b_idx] /= pivot;
// c = c / a
array_1[c_idx] /= pivot;
// d = d / a
array_1[d_idx] /= pivot;
// S = S / a
array_2[S_idx] /= pivot;
}
int e_idx = array_1_entry + VLEN * 4 + j;
int f_idx = array_1_entry + VLEN * 5 + j;
int g_idx = array_1_entry + VLEN * 6 + j;
int k_idx = array_1_entry + VLEN * 7 + j;
int T_idx = array_2_entry + VLEN * 1 + j;
pivot = array_1[e_idx];
// f = f - (e * b)
array_1[f_idx] -= array_1[b_idx]
* pivot;
// g = g - (e * c)
array_1[g_idx] -= array_1[c_idx]
* pivot;
// k = k - (e * d)
array_1[k_idx] -= array_1[d_idx]
* pivot;
// T = T - (e * S)
array_2[T_idx] -= array_2[S_idx]
* pivot;
}
}
}
}
对于这种特定情况,GCC 使用 16B 向量而不是 32B 向量进行自动向量化。很容易看出控制流依赖于一个可以从内部循环中检查出来的条件,但是 GCC 没有执行任何循环取消切换。
循环取消切换可以手动完成,但请注意,这是问题的最小情况,真正的循环有数百行,执行手动取消循环会导致大量代码冗余。我试图找到一种方法来强制 GCC 为可以从内部循环中检查出来的不同条件创建不同的循环。
目前我正在使用带有以下标志的 GCC 9.2:-Ofast -march=native -std=c11 -fopenmp -ftree-vectorize -ffast-math -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -fopt-info-vec-optimized