2

我使用 MS VS 2008 (SP1) 和 SSE 内在函数编写了简单的光线追踪器。当我打开 O2 优化(针对 x86 平台)时,它会加速 3 倍。显然我也想将它用于 x64。不幸的是,对于 x64,它会导致一些奇怪的问题。我把它缩小到一小段代码。

让我们采取以下射线:

__m128 ray = _mm_set_ps( 0.0, 1.0, -1.0, -1.0 );

我想对其进行归一化并计算 [ 1/abs(x), 1/abs(y), 1/abs(z), 0 ]。使用没有 O2 优化的 x64 版本我得到正确的结果:

Normalized abs: [ 0.57735, 0.57735, 0.57735, 0 ]
Result 1/v:     [ 1.73205, 1.73205, 1.73205, 0 ]

使用 O2 我得到:

Normalized abs: [ 0.57735, 0.57735, 0.57735, 0 ]
Result 1/v:     [ 1.73205, 1.73205, 0, 0 ]

对项目设置的不同更改会导致此问题消失或再次出现。例如,“启用链接时代码生成 (/GL)”在此示例中有所帮助,但在整个程序中没有帮助。正如你所看到的,这真的很奇怪。x64 的 O2 是否损坏或我的代码使用了一些禁止的 SSE 构造?这是示例:

雷信息.h

#pragma once

#include <smmintrin.h>

class RayInfo
{
    public:

        RayInfo( const __m128 & ray );

        __m128i m_RayDir;
        __m128 m_Ray;
        __m128 m_RayNotZeroMask;
        __m128 m_RayPosMask;
        __m128 m_RayNegMask;
        __m128 m_AbsRay;
        __m128 m_One_AbsRay;
};

雷信息.cpp

#include "RayInfo.h"

const __m128 onePosF = _mm_set_ps( 0.0, 1.0, 1.0, 1.0 );
const __m128 oneNegF = _mm_set_ps( 0.0, -1.0, -1.0, -1.0 );
const __m128 zeroF = _mm_setzero_ps();
const __m128 aaa = _mm_set_ps1( 123.0 );
const __m128 bbb = _mm_set_ps1( 456.0 );

__m128 normalized( const __m128 & v )
{
    // a0 = xx + yy + zz + ww
    __m128 dotProduct = _mm_dp_ps( v, v, 0xF1 );
    float tmp;
    _MM_EXTRACT_FLOAT( tmp, dotProduct, 0 );
    if ( tmp != 0.0f )
    {
        dotProduct = _mm_sqrt_ss( dotProduct );
        _MM_EXTRACT_FLOAT( tmp, dotProduct, 0 );
        return _mm_div_ps( v, _mm_set_ps1( tmp ) );
    }
    else return _mm_setzero_ps();
}

RayInfo::RayInfo( const __m128 & ray )
{
    m_Ray = normalized( ray );
    m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );
    m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );
    m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );
    m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( aaa, bbb, m_RayPosMask ), m_RayNotZeroMask ) );
    m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );
    m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );
}

主文件

#include <iostream>

#include "RayInfo.h"

int main( int argc, char * argv[] )
{
    __m128 ray = _mm_set_ps( 0.0, 1.0, -1.0, -1.0 );
    RayInfo ri( ray );

    float result[ 4 ];
    _mm_store_ps( result, ri.m_AbsRay );
    std::cout << "Normalized abs: [ " << result[ 0 ] << ", " << result[ 1 ] << ", " << result[ 2 ] << ", " << result[ 3 ] << " ] " << std::endl;
    _mm_store_ps( result, ri.m_One_AbsRay );
    std::cout << "Result 1/v:     [ " << result[ 0 ] << ", " << result[ 1 ] << ", " << result[ 2 ] << ", " << result[ 3 ] << " ] " << std::endl;
}

编辑:

使用 O2 优化的装配输出:

; 27   :    m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );

movaps  xmm4, XMMWORD PTR zeroF

; 28   :    m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );
; 29   :    m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );
; 30   :    m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( neg, pos, m_RayPosMask ), m_RayNotZeroMask ) );   

movaps  xmm2, XMMWORD PTR neg
movaps  xmm0, xmm5

; 31   :    m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );    
; 32   :    m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );   
; 33   : }

mov rax, rcx
cmpltps xmm0, xmm4
movaps  xmm1, xmm4
movaps  xmm3, xmm4
movaps  XMMWORD PTR [rcx+16], xmm5
cmpltps xmm1, xmm5
cmpneqps xmm3, xmm5
movaps  XMMWORD PTR [rcx+48], xmm1
movaps  XMMWORD PTR [rcx+32], xmm3
movaps  XMMWORD PTR [rcx+64], xmm0
blendvps xmm2, XMMWORD PTR pos, xmm1
movaps  xmm1, xmm4
blendvps xmm1, xmm2, xmm3
movss   xmm2, DWORD PTR __real@80000000
movaps  XMMWORD PTR [rcx], xmm1
movaps  xmm1, XMMWORD PTR onePosF
shufps  xmm2, xmm2, 0
andnps  xmm2, xmm5
divps   xmm1, xmm2
movaps  XMMWORD PTR [rcx+80], xmm2
blendvps xmm4, xmm1, xmm3
movaps  XMMWORD PTR [rcx+96], xmm4
add rsp, 24
ret 0

无 O2 的装配输出:

; 27   :    m_RayNotZeroMask = _mm_cmpneq_ps( m_Ray, zeroF );

mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR [rax+16]
cmpneqps xmm0, XMMWORD PTR zeroF
movaps  XMMWORD PTR $T6429[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6429[rsp]
movaps  XMMWORD PTR [rax+32], xmm0

; 28   :    m_RayPosMask = _mm_cmpgt_ps( m_Ray, zeroF );

mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR zeroF
cmpltps xmm0, XMMWORD PTR [rax+16]
movaps  XMMWORD PTR $T6430[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6430[rsp]
movaps  XMMWORD PTR [rax+48], xmm0

; 29   :    m_RayNegMask = _mm_cmplt_ps( m_Ray, zeroF );

mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR [rax+16]
cmpltps xmm0, XMMWORD PTR zeroF
movaps  XMMWORD PTR $T6431[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6431[rsp]
movaps  XMMWORD PTR [rax+64], xmm0

; 30   :    m_RayDir = _mm_castps_si128( _mm_blendv_ps( zeroF, _mm_blendv_ps( neg, pos, m_RayPosMask ), m_RayNotZeroMask ) );   

mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR [rax+48]
movaps  xmm1, XMMWORD PTR neg
blendvps xmm1, XMMWORD PTR pos, xmm0
movaps  XMMWORD PTR $T6432[rsp], xmm1
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR [rax+32]
movaps  xmm1, XMMWORD PTR zeroF
blendvps xmm1, XMMWORD PTR $T6432[rsp], xmm0
movaps  XMMWORD PTR $T6433[rsp], xmm1
movaps  xmm0, XMMWORD PTR $T6433[rsp]
movdqa  XMMWORD PTR $T6434[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movdqa  xmm0, XMMWORD PTR $T6434[rsp]
movdqa  XMMWORD PTR [rax], xmm0

; 31   :    m_AbsRay = _mm_andnot_ps( _mm_set_ps1( -0.0f ), m_Ray );    

movss   xmm0, DWORD PTR __real@80000000
shufps  xmm0, xmm0, 0
movaps  XMMWORD PTR $T6435[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6435[rsp]
andnps  xmm0, XMMWORD PTR [rax+16]
movaps  XMMWORD PTR $T6436[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6436[rsp]
movaps  XMMWORD PTR [rax+80], xmm0

; 32   :    m_One_AbsRay = _mm_blendv_ps( zeroF, _mm_div_ps( onePosF, m_AbsRay ), m_RayNotZeroMask );   

mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR onePosF
divps   xmm0, XMMWORD PTR [rax+80]
movaps  XMMWORD PTR $T6437[rsp], xmm0
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR [rax+32]
movaps  xmm1, XMMWORD PTR zeroF
blendvps xmm1, XMMWORD PTR $T6437[rsp], xmm0
movaps  XMMWORD PTR $T6438[rsp], xmm1
mov rax, QWORD PTR this$[rsp]
movaps  xmm0, XMMWORD PTR $T6438[rsp]
movaps  XMMWORD PTR [rax+96], xmm0
mov rcx, QWORD PTR __$ArrayPad$[rsp]
xor rcx, rsp
call    __security_check_cookie
add rsp, 248                ; 000000f8H
ret 0

编辑:

项目链接: https ://docs.google.com/file/d/0Bz554SfOcR3AS3QxZzJrZEZJelE/edit

4

0 回答 0