我在用 g++ 的 SSE 内在函数(intrinsics)实现一个简单的、非并行的求平均函数时,遇到了一个令人困扰的问题。下面是相关的 C++ 代码,其中 p_oNumPoints 是数组 p_pValues 的长度,而 p_pValues 本身是一个 const double* 类型的函数参数。
// Accumulates the elements of p_pValues with SSE2 packed-double intrinsics.
//
// p_rAvg       - output; in the visible excerpt it receives the sum of the two
//                accumulator lanes (no division by the element count is shown).
// p_oNumPoints - number of elements in p_pValues.
// p_pValues    - input array; read with unaligned loads, so no particular
//                alignment is required of the caller's buffer.
// p_rSum       - output; not written anywhere in the visible excerpt.
//
// NOTE(review): the excerpt ends before the function's closing brace, so the
// handling of the remaining (p_oNumPoints % 4) elements, the final division and
// the assignment of p_rSum are presumably in the part that is not shown.
inline void arithmeticAvg (double &p_rAvg, const unsigned int p_oNumPoints, const double *p_pValues, double &p_rSum)
{
unsigned int i;
// 16-byte aligned scratch buffer that receives the two accumulator lanes.
double oTmp[2] __attribute__((aligned(16))) = {0,0};
// Running packed sum (two doubles), starting at {0.0, 0.0}.
register __m128d XMM0 = _mm_setzero_pd();
register __m128d XMM1;
register __m128d XMM2;
// Number of complete groups of four doubles.
const unsigned int oLooplength = (p_oNumPoints / 4);
// Consume four doubles per iteration; trailing elements beyond the last
// complete group of four are not touched by this loop.
for (i=0; i < (oLooplength*4); i += 4)
{
// Two unaligned loads of two doubles each: elements [i, i+1] and [i+2, i+3].
XMM1 = _mm_loadu_pd(&p_pValues[i]);
XMM2 = _mm_loadu_pd(&p_pValues[i+2]);
XMM0 = _mm_add_pd(XMM0, XMM1);
XMM0 = _mm_add_pd(XMM0, XMM2);
}
// Spill the accumulator and add its two lanes to get the scalar total.
_mm_storeu_pd(oTmp, XMM0);
p_rAvg = oTmp[0] + oTmp[1];
gcc 使用选项 -O2 -S -msse4.1 编译后,生成了以下汇编代码:
b8b3c29e: xorpd %xmm0,%xmm0
"b8b3c2a2: movapd %xmm0,-0xa8(%ebp)"
b8b3c2aa: mov 0xc(%ebp),%eax
b8b3c2ad: shr $0x2,%eax
b8b3c2b0: mov %eax,-0x10(%ebp)
b8b3c2b3: movl $0x0,-0xc(%ebp)
b8b3c2ba: jmp 0xb8b3c35c <_ZN8precitec4math13arithmeticAvgERdjPKdS1_+252>
b8b3c2bf: mov -0xc(%ebp),%eax
b8b3c2c2: shl $0x3,%eax
b8b3c2c5: add 0x10(%ebp),%eax
b8b3c2c8: mov %eax,-0x14(%ebp)
b8b3c2cb: mov -0x14(%ebp),%eax
b8b3c2ce: movupd (%eax),%xmm0
"b8b3c2d2: movapd %xmm0,-0xb8(%ebp)"
b8b3c2da: mov -0xc(%ebp),%eax
b8b3c2dd: add $0x2,%eax
b8b3c2e0: shl $0x3,%eax
b8b3c2e3: add 0x10(%ebp),%eax
b8b3c2e6: mov %eax,-0x18(%ebp)
b8b3c2e9: mov -0x18(%ebp),%eax
b8b3c2ec: movupd (%eax),%xmm0
b8b3c2f0: movapd %xmm0,-0xc8(%ebp)
b8b3c2f8: movapd -0xa8(%ebp),%xmm0
b8b3c300: movapd %xmm0,-0x28(%ebp)
b8b3c305: movapd -0xb8(%ebp),%xmm0
b8b3c30d: movapd %xmm0,-0x38(%ebp)
b8b3c312: movapd -0x38(%ebp),%xmm0
b8b3c317: movapd -0x28(%ebp),%xmm1
b8b3c31c: addpd %xmm1,%xmm0
b8b3c320: movapd %xmm0,-0xa8(%ebp)
b8b3c328: movapd -0xa8(%ebp),%xmm0
b8b3c330: movapd %xmm0,-0x48(%ebp)
b8b3c335: movapd -0xc8(%ebp),%xmm0
b8b3c33d: movapd %xmm0,-0x58(%ebp)
b8b3c342: movapd -0x58(%ebp),%xmm0
b8b3c347: movapd -0x48(%ebp),%xmm1
b8b3c34c: addpd %xmm1,%xmm0
b8b3c350: movapd %xmm0,-0xa8(%ebp)
尽管循环部分生成的代码很糟糕(我尝试过使用 asm("xmm") 属性,但没有成功),但真正的问题是程序会在上面用引号标出的那些行处崩溃。调试时,崩溃发生在执行 _mm_setzero_pd() 这条语句时,我认为这是由指令重排造成的。
VS 2010 为这段代码生成的指令与此完全不同,并且不会发生任何崩溃。
有人知道问题出在哪里,以及该如何解决吗?
谢谢你的帮助,G。