35

考虑以下两个片段:

#define ALIGN_BYTES 32
#define ASSUME_ALIGNED(x) x = __builtin_assume_aligned(x, ALIGN_BYTES)

void fn0(const float *restrict a0, const float *restrict a1,
         float *restrict b, int n)
{
    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

void fn1(const float *restrict *restrict a, float *restrict b, int n)
{
    ASSUME_ALIGNED(a[0]); ASSUME_ALIGNED(a[1]); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a[0][i] + a[1][i];
}

当我编译该函数时,gcc-4.7.2 -Ofast -march=native -std=c99 -ftree-vectorizer-verbose=5 -S test.c -Wall我发现 GCC 为第二个函数插入了别名检查。

我怎样才能防止这种情况,使得生成的程序集与 forfn1的程序集相同fn0?(当参数的数量从 3 个增加到 30 个时,参数传递方法 ( fn0) 变得很麻烦,并且该fn1方法中的别名检查数量变得荒谬。)

组装(x86-64,支持 AVX 的芯片);在 .LFB10 处混叠杂物

fn0:
.LFB9:
    .cfi_startproc
    testl   %ecx, %ecx
    jle .L1
    movl    %ecx, %r10d
    shrl    $3, %r10d
    leal    0(,%r10,8), %r9d
    testl   %r9d, %r9d
    je  .L8
    cmpl    $7, %ecx
    jbe .L8
    xorl    %eax, %eax
    xorl    %r8d, %r8d
    .p2align 4,,10
    .p2align 3
.L4:
    vmovaps (%rsi,%rax), %ymm0
    addl    $1, %r8d
    vaddps  (%rdi,%rax), %ymm0, %ymm0
    vmovaps %ymm0, (%rdx,%rax)
    addq    $32, %rax
    cmpl    %r8d, %r10d
    ja  .L4
    cmpl    %r9d, %ecx
    je  .L1
.L3:
    movslq  %r9d, %rax
    salq    $2, %rax
    addq    %rax, %rdi
    addq    %rax, %rsi
    addq    %rax, %rdx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L6:
    vmovss  (%rsi,%rax,4), %xmm0
    vaddss  (%rdi,%rax,4), %xmm0, %xmm0
    vmovss  %xmm0, (%rdx,%rax,4)
    addq    $1, %rax
    leal    (%r9,%rax), %r8d
    cmpl    %r8d, %ecx
    jg  .L6
.L1:
    vzeroupper
    ret
.L8:
    xorl    %r9d, %r9d
    jmp .L3
    .cfi_endproc
.LFE9:
    .size   fn0, .-fn0
    .p2align 4,,15
    .globl  fn1
    .type   fn1, @function
fn1:
.LFB10:
    .cfi_startproc
    testq   %rdx, %rdx
    movq    (%rdi), %r8
    movq    8(%rdi), %r9
    je  .L12
    leaq    32(%rsi), %rdi
    movq    %rdx, %r10
    leaq    32(%r8), %r11
    shrq    $3, %r10
    cmpq    %rdi, %r8
    leaq    0(,%r10,8), %rax
    setae   %cl
    cmpq    %r11, %rsi
    setae   %r11b
    orl %r11d, %ecx
    cmpq    %rdi, %r9
    leaq    32(%r9), %r11
    setae   %dil
    cmpq    %r11, %rsi
    setae   %r11b
    orl %r11d, %edi
    andl    %edi, %ecx
    cmpq    $7, %rdx
    seta    %dil
    testb   %dil, %cl
    je  .L19
    testq   %rax, %rax
    je  .L19
    xorl    %ecx, %ecx
    xorl    %edi, %edi
    .p2align 4,,10
    .p2align 3
.L15:
    vmovaps (%r9,%rcx), %ymm0
    addq    $1, %rdi
    vaddps  (%r8,%rcx), %ymm0, %ymm0
    vmovaps %ymm0, (%rsi,%rcx)
    addq    $32, %rcx
    cmpq    %rdi, %r10
    ja  .L15
    cmpq    %rax, %rdx
    je  .L12
    .p2align 4,,10
    .p2align 3
.L20:
    vmovss  (%r9,%rax,4), %xmm0
    vaddss  (%r8,%rax,4), %xmm0, %xmm0
    vmovss  %xmm0, (%rsi,%rax,4)
    addq    $1, %rax
    cmpq    %rax, %rdx
    ja  .L20
.L12:
    vzeroupper
    ret
.L19:
    xorl    %eax, %eax
    jmp .L20
    .cfi_endproc
4

4 回答 4

1

可以告诉编译器停止检查别名:

请添加行:

#pragma GCC ivdep

就在您要矢量化的循环前面,如果您需要更多信息,请阅读:

https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/Loop-Specific-Pragmas.html

于 2015-01-19T12:13:51.107 回答
0

Well, what about the flag

-fno-strict-aliasing

?

As I understood you right you just want to know how to turn this checks off? If thats all, this parameter to gcc commandline should be helping you.

EDIT:

In addition to your comment: isn't it forbidden to use const type restrict pointers?

this is from ISO/IEC 9899 (6.7.3.1 Formal definition of restrict):

1.

Let D be a declaration of an ordinary identifier that provides a means of designating an object P as a restrict-qualified pointer to type T.

4.

During each execution of B, let L be any lvalue that has &L based on P. If L is used to access the value of the object X that it designates, and X is also modified (by any means), then the following requirements apply: T shall not be const-qualified. Every other lvalue used to access the value of X shall also have its address based on P. Every access that modifies X shall be considered also to modify P, for the purposes of this subclause. If P is assigned the value of a pointer expression E that is based on another restricted pointer object P2, associated with block B2, then either the execution of B2 shall begin before the execution of B, or the execution of B2 shall end prior to the assignment. If these requirements are not met, then the behavior is undefined.

And a much more interesting point, same as with register is this one:

6.

A translator is free to ignore any or all aliasing implications of uses of restrict.

So if you can't find a command parameter which forces gcc to do so, its probably not possible, because from the standard it doesn't have to give the option to do so.

于 2013-08-09T08:02:51.647 回答
0

这有帮助吗?

void fn1(const float **restrict a, float *restrict b, int n)
{
    const float * restrict a0 = a[0];
    const float * restrict a1 = a[1];

    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

编辑:第二次尝试:)。来自http://locklessinc.com/articles/vectorize/的信息

gcc --fast-math ...

于 2013-03-25T12:33:41.070 回答
0

我提前道歉,因为我无法在我的机器上使用 GCC 4.7 重现结果,但有两种可能的解决方案。

  1. 使用 typedef* restrict * restrict正确组合。根据开发 LLVM 编译器的前同事的说法,这是typedef在 C 中表现得像预处理器的唯一例外,它的存在是为了允许您想要的抗锯齿行为。

    我在下面尝试了这个,但我不确定我是否成功。请仔细检查我的尝试。

  2. 使用将限制限定符与 C99 可变长度数组 (VLA) 一起使用的答案中描述的语法。

    我在下面尝试了这个,但我不确定我是否成功。请仔细检查我的尝试。

这是我用来执行实验的代码,但我无法最终确定我的任何一个建议是否按预期工作。

#define ALIGN_BYTES 32
#define ASSUME_ALIGNED(x) x = __builtin_assume_aligned(x, ALIGN_BYTES)

void fn0(const float *restrict a0, const float *restrict a1,
         float *restrict b, int n)
{
    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

#if defined(ARRAY_RESTRICT)
void fn1(const float *restrict a[restrict], float * restrict b, int n)
#elif defined(TYPEDEF_SOLUTION)
typedef float * restrict frp;
void fn1(const frp *restrict a, float *restrict b, int n)
#else
void fn1(const float *restrict *restrict a, float *restrict b, int n)
#endif
{
    //ASSUME_ALIGNED(a[0]); ASSUME_ALIGNED(a[1]); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a[0][i] + a[1][i];
}

再次,我为这个答案半生不熟的性质道歉。请不要因为尝试但没有成功而对我投反对票。

于 2015-04-28T21:50:42.347 回答