我使用 MS VS 2008 (SP1) 和 SSE 内在函数编写了简单的光线追踪器。当我打开 O2 优化(针对 x86 平台)时,它会加速 3 倍。显然我也想将它用于 x64。不幸的是,对于 x64,它会导致一些奇怪的问题。我把它缩小到一小段代码。
让我们采取以下射线:
__m128 ray = _mm_set_ps( 0.0, 1.0, -1.0, -1.0 );
我想对其进行归一化并计算 [ 1/abs(x), 1/abs(y), 1/abs(z), 0 ]。使用没有 O2 优化的 x64 版本我得到正确的结果:
Normalized abs: [ 0.57735, 0.57735, 0.57735, 0 ]
Result 1/v: [ 1.73205, 1.73205, 1.73205, 0 ]
使用 O2 我得到:
Normalized abs: [ 0.57735, 0.57735, 0.57735, 0 ]
Result 1/v: [ 1.73205, 1.73205, 0, 0 ]
对项目设置的不同更改会导致此问题消失或再次出现。例如,“启用链接时代码生成 (/GL)”在此示例中有所帮助,但在整个程序中没有帮助。正如你所看到的,这真的很奇怪。x64 的 O2 是否损坏或我的代码使用了一些禁止的 SSE 构造?这是示例:
雷信息.h
#pragma once
#include <smmintrin.h>
class RayInfo
{
public:
RayInfo( const __m128 & ray );
__m128i m_RayDir;
__m128 m_Ray;
__m128 m_RayNotZeroMask;
__m128 m_RayPosMask;
__m128 m_RayNegMask;
__m128 m_AbsRay;
__m128 m_One_AbsRay;
};
雷信息.cpp
#include "RayInfo.h"
const __m128 onePosF = _mm_set_ps( 0.0, 1.0, 1.0, 1.0 );
const __m128 oneNegF = _mm_set_ps( 0.0, -1.0, -1.0, -1.0 );
const __m128 zeroF = _mm_setzero_ps();
const __m128 aaa = _mm_set_ps1( 123.0 );
const __m128 bbb = _mm_set_ps1( 456.0 );
__m128 normalized( const __m128 & v )
{
// a0 = xx + yy + zz + ww
__m128 dotProduct = _mm_dp_ps( v, v, 0xF1 );
float tmp;
_MM_EXTRACT_FLOAT( tmp, dotProduct, 0 );
if ( tmp != 0.0f )
{
dotProduct = _mm_sqrt_ss( dotProduct );
_MM_EXTRACT_FLOAT( tmp, dotProduct, 0 );
return _mm_div_ps( v, _mm_set_ps1( tmp ) );
}
else return _mm_setzero_ps();
}
RayInfo::RayInfo( const __m128 & ray )
{
m_Ray = normalized( ray );
m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );
m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );
m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );
m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( aaa, bbb, m_RayPosMask ), m_RayNotZeroMask ) );
m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );
m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );
}
主文件
#include <iostream>
#include "RayInfo.h"
int main( int argc, char * argv[] )
{
__m128 ray = _mm_set_ps( 0.0, 1.0, -1.0, -1.0 );
RayInfo ri( ray );
float result[ 4 ];
_mm_store_ps( result, ri.m_AbsRay );
std::cout << "Normalized abs: [ " << result[ 0 ] << ", " << result[ 1 ] << ", " << result[ 2 ] << ", " << result[ 3 ] << " ] " << std::endl;
_mm_store_ps( result, ri.m_One_AbsRay );
std::cout << "Result 1/v: [ " << result[ 0 ] << ", " << result[ 1 ] << ", " << result[ 2 ] << ", " << result[ 3 ] << " ] " << std::endl;
}
编辑:
使用 O2 优化的装配输出:
; 27 : m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );
movaps xmm4, XMMWORD PTR zeroF
; 28 : m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );
; 29 : m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );
; 30 : m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( neg, pos, m_RayPosMask ), m_RayNotZeroMask ) );
movaps xmm2, XMMWORD PTR neg
movaps xmm0, xmm5
; 31 : m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );
; 32 : m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );
; 33 : }
mov rax, rcx
cmpltps xmm0, xmm4
movaps xmm1, xmm4
movaps xmm3, xmm4
movaps XMMWORD PTR [rcx+16], xmm5
cmpltps xmm1, xmm5
cmpneqps xmm3, xmm5
movaps XMMWORD PTR [rcx+48], xmm1
movaps XMMWORD PTR [rcx+32], xmm3
movaps XMMWORD PTR [rcx+64], xmm0
blendvps xmm2, XMMWORD PTR pos, xmm1
movaps xmm1, xmm4
blendvps xmm1, xmm2, xmm3
movss xmm2, DWORD PTR __real@80000000
movaps XMMWORD PTR [rcx], xmm1
movaps xmm1, XMMWORD PTR onePosF
shufps xmm2, xmm2, 0
andnps xmm2, xmm5
divps xmm1, xmm2
movaps XMMWORD PTR [rcx+80], xmm2
blendvps xmm4, xmm1, xmm3
movaps XMMWORD PTR [rcx+96], xmm4
add rsp, 24
ret 0
无 O2 的装配输出:
; 27 : m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR [rax+16]
cmpneqps xmm0, XMMWORD PTR zeroF
movaps XMMWORD PTR $T6429[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6429[rsp]
movaps XMMWORD PTR [rax+32], xmm0
; 28 : m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR zeroF
cmpltps xmm0, XMMWORD PTR [rax+16]
movaps XMMWORD PTR $T6430[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6430[rsp]
movaps XMMWORD PTR [rax+48], xmm0
; 29 : m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR [rax+16]
cmpltps xmm0, XMMWORD PTR zeroF
movaps XMMWORD PTR $T6431[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6431[rsp]
movaps XMMWORD PTR [rax+64], xmm0
; 30 : m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( neg, pos, m_RayPosMask ), m_RayNotZeroMask ) );
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR [rax+48]
movaps xmm1, XMMWORD PTR neg
blendvps xmm1, XMMWORD PTR pos, xmm0
movaps XMMWORD PTR $T6432[rsp], xmm1
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR [rax+32]
movaps xmm1, XMMWORD PTR zeroF
blendvps xmm1, XMMWORD PTR $T6432[rsp], xmm0
movaps XMMWORD PTR $T6433[rsp], xmm1
movaps xmm0, XMMWORD PTR $T6433[rsp]
movdqa XMMWORD PTR $T6434[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movdqa xmm0, XMMWORD PTR $T6434[rsp]
movdqa XMMWORD PTR [rax], xmm0
; 31 : m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );
movss xmm0, DWORD PTR __real@80000000
shufps xmm0, xmm0, 0
movaps XMMWORD PTR $T6435[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6435[rsp]
andnps xmm0, XMMWORD PTR [rax+16]
movaps XMMWORD PTR $T6436[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6436[rsp]
movaps XMMWORD PTR [rax+80], xmm0
; 32 : m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR onePosF
divps xmm0, XMMWORD PTR [rax+80]
movaps XMMWORD PTR $T6437[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR [rax+32]
movaps xmm1, XMMWORD PTR zeroF
blendvps xmm1, XMMWORD PTR $T6437[rsp], xmm0
movaps XMMWORD PTR $T6438[rsp], xmm1
mov rax, QWORD PTR this$[rsp]
movaps xmm0, XMMWORD PTR $T6438[rsp]
movaps XMMWORD PTR [rax+96], xmm0
mov rcx, QWORD PTR __$ArrayPad$[rsp]
xor rcx, rsp
call __security_check_cookie
add rsp, 248 ; 000000f8H
ret 0
编辑:
项目链接: https ://docs.google.com/file/d/0Bz554SfOcR3AS3QxZzJrZEZJelE/edit