assembly - 如何通过用 x86 汇编重写来优化一个 intersectRaySphere C 函数？

Question

你好，我对汇编了解不多。我正在考虑如何通过用 x86（32 位 FPU 或 SSE2）汇编重写下面的函数来优化它。我希望先得到一个正确的汇编版本，然后再测试它是否真的更快（length() 和 dot() 也应该用汇编编写）。这段代码用在我的简单实时光线追踪器中，它可以正常工作——只是我不太擅长汇编优化。

    /*
     * Intersects a ray with a sphere.
     *
     * By name, rO/rV are the ray origin and direction, sO is the sphere
     * centre and sR its radius.
     * NOTE(review): the formula only yields a true distance when rV has
     * unit length -- confirm that callers normalise it.
     *
     * Returns -1.0f when the discriminant d is negative (no intersection),
     * otherwise v - sqrt(d), the distance to the first intersection point.
     */
    inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
    {
        /* Automatic storage instead of a static local: the function stays
           reentrant and thread-safe, and an inline function with external
           linkage may not define a modifiable static object in C99. */
        float3 Q = sub(sO, rO);

        /* Squared length of Q taken directly; calling length() would take a
           square root only to square the result again. */
        float qq = dot(&Q, &Q);
        float v = dot(&Q, rV);
        float d = sR*sR - (qq - v*v);

        /* If there was no intersection, return -1 */
        if (d < 0.0f) return -1.0f;

        /* Single-precision root: avoids a float -> double -> float round trip */
        return v - sqrtf(d);
    }

先感谢您

//编辑

    /* Three-component single-precision vector with members x, y and z. */
    struct float3
    {
        float x, y, z;
    };


    /* Returns the Euclidean length of *v: sqrt(x*x + y*y + z*z). */
    inline float length(float3* v)
    {
        const float x = v->x, y = v->y, z = v->z;

        return sqrt(x*x + y*y + z*z);
    }

   /* Returns the dot product of *a and *b (sum of the component-wise products). */
   inline float dot(float3* a, float3* b)
   {
       return a->x * b->x
            + a->y * b->y
            + a->z * b->z;
   }

以及演示 exe（用 C 编写，未做太多优化）：

dl.dropbox.com/u/42887985/re29.zip

也许有人可以给我一个比较好的 FPU 汇编例程，用于 length、dot（或此处未显示的 normalize）？当然，如果能给出整个相交函数的汇编版本就最好了 ;-)

score 2 · Accepted Answer

这不是一个适合转换为 SSE 的“好”函数：其中几乎没有可以并行的部分。因此，让我们把函数改为一次让 4 条射线与球体求交。如果光线以 SoA（structure of arrays，由数组组成的结构）而不是 AoS（array of structures，由结构组成的数组）的形式存储，也会有所帮助。

通过这些更改，它可能会变成这样（未以任何方式测试）：

// Untested assembly sketch: intersects four rays with one sphere using packed
// single-precision SSE instructions. Each r?? pointer is used as a 128-bit
// memory operand (four floats, one per ray); sOx/sOy/sOz and sR are scalars
// that get broadcast to all four lanes.
// On exit xmm0 holds, per lane, -1.0 for "no intersection" or v - sqrt(d).
// NOTE(review): the function is declared void and ends in a bare `ret`, so the
// result is only available in xmm0 -- confirm how callers are meant to read it.
// NOTE(review): rOx..rVz are pointer parameters; depending on the inline
// assembler, `[rOx]` may address the pointer variable itself rather than the
// float array it points to -- confirm, and load the pointer into a register
// first if needed.
inline void intersect4RaysSphere(
 float* rOx, float* rOy, float* rOz,
 float* rVx, float* rVy, float* rVz,
 float sOx, float sOy, float sOz,
 float sR)
{
    // calculate Q = sO - rO for all four rays:
    // load each sphere coordinate into lane 0, then broadcast it
    movss xmm0, sOx
    movss xmm1, sOy
    movss xmm2, sOz
    shufps xmm0, xmm0, 0
    shufps xmm1, xmm1, 0
    shufps xmm2, xmm2, 0
    subps xmm0, [rOx]
    subps xmm1, [rOy]
    subps xmm2, [rOz]
    // calculate pow(dot(Q, rV), 2) in xmm3
    // movaps with a memory operand requires 16-byte alignment
    movaps xmm3, [rVx]
    movaps xmm4, [rVy]
    movaps xmm5, [rVz]
    mulps xmm3, xmm0
    mulps xmm4, xmm1
    mulps xmm5, xmm2
    addps xmm3, xmm4
    addps xmm3, xmm5
    // keep the unsquared dot product v in xmm4 for the final result
    movaps xmm4, xmm3
    mulps xmm3, xmm3
    // calculate pow(length(Q), 2)
    // there's no point in taking the square root only to then square it
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    addps xmm0, xmm1
    addps xmm0, xmm2
    // calculate d = sR*sR - (|Q|^2 - v*v)
    movss xmm1, sR
    mulss xmm1, xmm1
    shufps xmm1, xmm1, 0
    subps xmm0, xmm3
    subps xmm1, xmm0
    sqrtps xmm1, xmm1
    // test for intersection
    // at this point:
    // xmm3 = v * v
    // xmm4 = v
    // xmm1 = sqrt(d)
    movaps xmm0, [minus1]  // memory location with { -1.0, -1.0, -1.0, -1.0 }
    subps xmm4, xmm1
    // get a mask of d's smaller than 0.0
    // NOTE(review): xmm1 holds sqrt(d) here, not d, so the mask is taken from
    // the sign bit of the square-root result; this relies on sqrtps returning
    // a NaN with the sign bit set for negative inputs -- confirm against the
    // instruction reference.
    psrad xmm1, 31
    // select -1 if less than zero or v - sqrt(d) if >= 0
    andps xmm0, xmm1
    andnps xmm1, xmm4
    orps xmm0, xmm1
    ret
}

使用 intrinsics（编译器内建函数）的版本（仅经过少量测试——可以编译，并且生成的汇编看起来没有问题）：

/*
 * Intersects four rays with one sphere using SSE intrinsics.
 *
 * rOx/rOy/rOz and rVx/rVy/rVz each point to four floats (one per ray) and
 * must be 16-byte aligned, because they are read with _mm_load_ps.
 * sOx/sOy/sOz and sR are scalars broadcast to all four lanes.
 *
 * Returns, per lane, -1.0f when d = sR*sR - (|Q|^2 - v*v) is negative
 * (no intersection), otherwise v - sqrt(d), where Q = sO - rO and
 * v = dot(Q, rV).
 */
__m128 intersect4RaysSphere(
     float* rOx, float* rOy, float* rOz,
     float* rVx, float* rVy, float* rVz,
     float sOx, float sOy, float sOz,
     float sR)
{
    __m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
    __m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
    __m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
    __m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
               _mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
                          _mm_mul_ps(Qz, _mm_load_ps(rVz))));
    __m128 vsquared = _mm_mul_ps(v, v);
    /* Squared length of Q: no square root is needed, it would only be squared again. */
    __m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
                            _mm_add_ps(_mm_mul_ps(Qy, Qy),
                                       _mm_mul_ps(Qz, Qz)));
    __m128 sr = _mm_set1_ps(sR);
    __m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));

    /* A real floating-point compare gives the same answer as the scalar
       `d < 0` test (all-ones lanes where d < 0, false for NaN) and needs
       only SSE1, unlike extracting the sign bit with integer shifts. */
    __m128 miss = _mm_cmplt_ps(d, _mm_setzero_ps());

    /* Lanes with d < 0 produce NaN here; they are masked out below. */
    __m128 distance = _mm_sub_ps(v, _mm_sqrt_ps(d));

    /* Branch-free select: -1.0f where missed, the hit distance elsewhere. */
    return _mm_or_ps(_mm_and_ps(miss, _mm_set1_ps(-1.0f)),
                     _mm_andnot_ps(miss, distance));
}

score 2 · Accepted Answer

// Computes the length of the vector stored at [float3] with SSE.
// The load reads 16 bytes, so the operand must be 16-byte aligned and its
// fourth float (w) must be 0 for the sum to equal x^2 + y^2 + z^2.
// After the store, lane 0 of [result] holds the length (sqrtss only touches
// lane 0); lanes 1-3 hold the squared length. [result] must also be 16-byte
// aligned and 16 bytes large.
__asm
    {
    movaps xmm0,[float3]        // xmm0 = {x, y, z, w}
    mulps xmm0,xmm0             // xmm0 = {x^2, y^2, z^2, w^2}
    pxor xmm1,xmm1              // clear xmm1 first
    movlhps xmm1,xmm0           // xmm1 = {0, 0, x^2, y^2}
    addps xmm0,xmm1             // xmm0 = {x^2, y^2, x^2+z^2, y^2+w^2}
    movaps xmm2,xmm0            // copy first: shufps takes its low lanes from the destination
    shufps xmm2,xmm2,10101010b  // broadcast lane 2 (x^2+z^2) to all 4 lanes of xmm2
    addps xmm0,xmm2             // lane 3 = x^2 + y^2 + z^2 + w^2
    shufps xmm0,xmm0,11111111b  // copy the sum of squares to all 4 lanes
    sqrtss xmm0,xmm0            // scalar square root of lane 0
    movaps [result],xmm0
    }

这可能比完全优化的版本慢，但对于向量长度计算应该足够快。向量需要按 16 字节对齐；如果您不想对齐，请将 movaps 改为 movups。如果这段代码可以正常工作，您还可以通过在代码前放置以下指令来进一步提高性能：

align 16

把它放在 movaps xmm0,[float3] 之前，使代码本身也对齐。然后您可以检查每条指令占多少字节，尽量让代码长度达到最佳（16 字节的倍数）。在 SSE2 之后的指令集（SSE3、SSE4、AVX）中，有水平方向的向量指令（例如 SSE3 的 haddps、SSE4.1 的 dpps），只需要 1 条指令即可获得结果。

编辑：已将第二条指令中的 mm0,xmm0 改为 xmm0,xmm0。

这是一些清单：

http://softpixel.com/~cwright/programming/simd/sse2.php

assembly - 如何通过用 x86 汇编重写来优化一个 intersectRaySphere C 函数？

2 回答 2

Related

Reference