这是我的 SSE 库的一部分。在处理海量数据时,我总是使用 SoA(数组结构)而不是 AoS(结构数组)。__m128 / __m256 的运算符重载使得将 C/C++ 算法转换为 SIMD 变得容易。
库不提供加载/存储封装,因为 SSE/AVX 对内存操作非常敏感:一次糟糕的内存访问会浪费数十个 CPU 周期并使计算停滞。
// Lane-wise (4 x float) operator overloads for __m128 so scalar C/C++ math
// expressions can be transliterated to SIMD unchanged.
// NOTE(review): on GCC/Clang, __m128 is a built-in vector type that already
// has native operators, so these overloads may be ill-formed or ambiguous
// there — this set is MSVC-oriented; confirm the target compiler.
__forceinline __m128 operator+(__m128 l, __m128 r) { return _mm_add_ps(l,r); }
__forceinline __m128 operator-(__m128 l, __m128 r) { return _mm_sub_ps(l,r); }
__forceinline __m128 operator*(__m128 l, __m128 r) { return _mm_mul_ps(l,r); }
__forceinline __m128 operator/(__m128 l, __m128 r) { return _mm_div_ps(l,r); }
__forceinline __m128 operator&(__m128 l, __m128 r) { return _mm_and_ps(l,r); }
__forceinline __m128 operator|(__m128 l, __m128 r) { return _mm_or_ps(l,r); }
// The comparison operators do NOT return bool: each lane yields an all-ones
// (0xFFFFFFFF) or all-zeros bit mask, suitable as the mask argument of
// _mm_merge_ps / _mm_andnot_ps-style branchless selects.
__forceinline __m128 operator<(__m128 l, __m128 r) { return _mm_cmplt_ps(l,r); }
__forceinline __m128 operator>(__m128 l, __m128 r) { return _mm_cmpgt_ps(l,r); }
__forceinline __m128 operator<=(__m128 l, __m128 r) { return _mm_cmple_ps(l,r); }
__forceinline __m128 operator>=(__m128 l, __m128 r) { return _mm_cmpge_ps(l,r); }
__forceinline __m128 operator!=(__m128 l, __m128 r) { return _mm_cmpneq_ps(l,r); }
__forceinline __m128 operator==(__m128 l, __m128 r) { return _mm_cmpeq_ps(l,r); }
// Branchless per-bit select: result takes bits from r where the mask bit is
// set and from l where it is clear (SSE1-era equivalent of _mm_blendv_ps).
// NOTE(review): the _mm_ prefix is reserved for compiler intrinsics — a
// rename would be cleaner, but the name is kept because code elsewhere in
// this file calls it.
__forceinline __m128 _mm_merge_ps(__m128 m, __m128 l, __m128 r)
{
    const __m128 from_r = _mm_and_ps(m, r);
    const __m128 from_l = _mm_andnot_ps(m, l);
    return _mm_or_ps(from_r, from_l);
}
// SoA bundle of four 3D points: x holds the four X coordinates, y the four
// Y coordinates, z the four Z coordinates (one point per SIMD lane).
struct TPoint4
{
// Default ctor leaves all lanes uninitialized — intentional for hot paths.
TPoint4() {}
// Broadcasts one point into all four lanes.
TPoint4(const D3DXVECTOR3& a) :x(_mm_set1_ps(a.x)), y(_mm_set1_ps(a.y)), z(_mm_set1_ps(a.z)) {}
// Direct lane-register construction.
TPoint4(__m128 a, __m128 b, __m128 c) :x(a), y(b), z(c) {}
// From a 3-element __m128 array laid out as { x, y, z }.
TPoint4(const __m128* a) :x(a[0]), y(a[1]), z(a[2]) {}
// Packs four points, one per lane.
// NOTE(review): _mm_set_ps places its FIRST argument in the HIGHEST lane,
// so a lands in lane 3 and d in lane 0 — confirm callers expect this
// (use _mm_setr_ps for a..d in lanes 0..3 if not).
TPoint4(const D3DXVECTOR3& a, const D3DXVECTOR3& b, const D3DXVECTOR3& c, const D3DXVECTOR3& d) :x(_mm_set_ps(a.x,b.x,c.x,d.x)), y(_mm_set_ps(a.y,b.y,c.y,d.y)), z(_mm_set_ps(a.z,b.z,c.z,d.z)) {}
// Implicit decay to the underlying 3-register array (x at index 0).
operator __m128* () { return &x; }
operator const __m128* () const { return &x; }
// Component-wise arithmetic on the whole 4-point bundle.
TPoint4 operator+(const TPoint4& r) const { return TPoint4(x+r.x, y+r.y, z+r.z); }
TPoint4 operator-(const TPoint4& r) const { return TPoint4(x-r.x, y-r.y, z-r.z); }
// Scale all components by a per-lane factor.
TPoint4 operator*(__m128 r) const { return TPoint4(x * r, y * r, z * r); }
TPoint4 operator/(__m128 r) const { return TPoint4(x / r, y / r, z / r); }
// Read-only component access: 0 = x, 1 = y, 2 = z (returns a copy).
__m128 operator[](int index) const { return _val[index]; }
// x/y/z and _val alias the same storage. Anonymous structs inside a union
// are a compiler extension (accepted by MSVC/GCC/Clang).
union
{
struct
{
__m128 x, y, z;
};
struct
{
__m128 _val[3];
};
};
};
// Per-lane 3D cross product: result = l x r for each of the four points.
// Returns result to allow call chaining (D3DX convention).
// Fix: compute all three components into temporaries BEFORE storing. The
// original wrote result->x first, so an in-place call (result aliasing l or
// r, e.g. TPoint4Cross(p, p, q)) read the already-overwritten l->x when
// computing y and z, producing a corrupted result.
__forceinline TPoint4* TPoint4Cross(TPoint4* result, const TPoint4* l, const TPoint4* r)
{
    const __m128 cx = (l->y * r->z) - (l->z * r->y);
    const __m128 cy = (l->z * r->x) - (l->x * r->z);
    const __m128 cz = (l->x * r->y) - (l->y * r->x);
    result->x = cx;
    result->y = cy;
    result->z = cz;
    return result;
}
// Per-lane dot product: returns four dot(l, r) values, one per SIMD lane.
__forceinline __m128 TPoint4Dot(const TPoint4* l, const TPoint4* r)
{
    __m128 acc = l->x * r->x;
    acc = acc + (l->y * r->y);
    acc = acc + (l->z * r->z);
    return acc;
}
// Normalizes all four points in place-safe fashion (the reciprocal length is
// computed before any store, so result may alias l).
// NOTE(review): _mm_rsqrt_ps is an ~12-bit approximation and yields inf/NaN
// for zero-length input — acceptable for this library's speed-first design.
__forceinline TPoint4* TPoint4Normalize(TPoint4* result, const TPoint4* l)
{
    const __m128 inv_len = _mm_rsqrt_ps(TPoint4Dot(l, l));
    result->x = l->x * inv_len;
    result->y = l->y * inv_len;
    result->z = l->z * inv_len;
    return result;
}
// Per-lane Euclidean length of the four points (full-precision sqrt).
__forceinline __m128 TPoint4Length(const TPoint4* l)
{
    return _mm_sqrt_ps(TPoint4Dot(l, l));
}
// Branchless per-lane select across all three components: where a mask lane
// is all-ones the component comes from r, where all-zeros it comes from l.
// Pairs with the mask-returning comparison operators above.
__forceinline TPoint4* TPoint4Merge(TPoint4* result, __m128 mask, const TPoint4* l, const TPoint4* r)
{
    for (int i = 0; i < 3; ++i)
        result->_val[i] = _mm_merge_ps(mask, l->_val[i], r->_val[i]);
    return result;
}
extern __m128 g_zero4;
extern __m128 g_one4;
extern __m128 g_fltMax4;
extern __m128 g_mask4;
extern __m128 g_epsilon4;