使用 arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp 编译此程序时:
#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
return vcombine_f32(vuzp_f32(x, y).val[0], z);
}
......这就是我得到的。注意标有@<<< 的两条无用指令
_Z5crossRK19__simd128_float32_tS1_:
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #80 @<<<
vswp d17, d16
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80 @<<<
bx
堆栈永远不会被访问,但堆栈指针会递减,然后以相同的量递增。为什么?
如果我修改原始代码以在序言的末尾和结尾的开头包含 asm 注释,如下所示:
#include <arm_neon.h>
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
asm volatile("# End of prologue");
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
asm volatile("# Start of epilogue");
return res;
}
然后我得到稍微不同的版本:
_Z5crossRK19__simd128_float32_tS1_:
sub sp, sp, #80
# End of prologue
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
vswp d17, d16
# Start of epilogue
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80
bx lr
堆栈指针递减/递增显然是序言/结尾的一部分,即使不使用堆栈也会发生。这是为了符合某些标准,还是 gcc 优化错误?
编辑:编译器是 arm-elf-gcc-4.5 (GCC) 4.5.0,配置为:/opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/配置 --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx -include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu
编辑:我设法使用以下 C 源代码查明问题。仅当使用向量类型的数组作为临时对象时才会发生这种情况,例如声明为的 float32x4x2_t struct { float32x4_t val[2]; }
,即使这些临时对象是寄存器。我相信这是一个错误,所以我报告了它。
#include <arm_neon.h>
// This one is ok
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
return vaddq_f32(*v1, *v2);
#if 0
produced assembly:
add:
vldmia r0, {d16-d17}
vldmia r1, {d18-d19}
vadd.f32 q8, q8, q9
vmov r0, r1, d16
vmov r2, r3, d17
bx lr
#endif
}
// This one uses float32x4x2_t temporaries and has the bug
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
float32x4x2_t
xxyyzz1=vzipq_f32(*v1, *v1),
xxyyzz2=vzipq_f32(*v2, *v2);
float32x2_t
xx1=vget_low_f32(xxyyzz1.val[0]),
yy1=vget_high_f32(xxyyzz1.val[0]),
zz1=vget_low_f32(xxyyzz1.val[1]),
xx2=vget_low_f32(xxyyzz2.val[0]),
yy2=vget_high_f32(xxyyzz2.val[0]),
zz2=vget_low_f32(xxyyzz2.val[1]);
float32x2_t
x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);
return vcombine_f32(vuzp_f32(x, y).val[0], z);
#if 0
produced assembly:
cross:
vldmia r0, {d18-d19}
vldmia r1, {d16-d17}
vmov q10, q9
vmov q11, q8
vzip.32 q9, q10
vzip.32 q8, q11
vmov d24, d19
vmov d21, d16
vmov d16, d17
vmul.f32 d19, d20, d21
vmul.f32 d17, d24, d22
vmls.f32 d17, d20, d16
vmls.f32 d19, d18, d22
vmul.f32 d16, d18, d16
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #48 @ here
vswp d17, d16
vmov r0, r1, d16
vmov r2, r3, d17
add sp, sp, #48 @ and here
bx lr
#endif
}