c++ - (Vec4 x Mat4x4) 产品使用 SIMD 和改进

Question

我正在编写一个复杂的模拟程序，并且发现最耗时的例程是将四向量（float4）与 4x4 矩阵相乘的例程。我需要在几台或多或少旧的计算机上运行这个程序。这就是为什么我尝试在以下代码中检查此类操作的 SIMD 功能：

//#include <xmmintrin.h> // SSE
//#include <pmmintrin.h> // SSE3
//#include <nmmintrin.h> // SSE4.2
  #include <immintrin.h> // AVX

#include <iostream>
#include <ctime>
#include <string>

using namespace std;

// 4-vector.
typedef struct
{
    float x;
    float y;
    float z;
    float w;
}float4;

// typedef to simplify the pointer of function notation.
typedef void(*Function)(float4&,const float4*,const float4&);

float dot( const float4& in_A, const float4& in_x )
{
    return in_A.x*in_x.x + in_A.y*in_x.y + in_A.z*in_x.z + in_A.w*in_x.w; // 7 FLOPS
}

void A_times_x( float4& out_y, const float4* in_A, const float4& in_x )
{
    out_y.x = dot(in_A[0], in_x); // 7 FLOPS
    out_y.y = dot(in_A[1], in_x); // 7 FLOPS
    out_y.z = dot(in_A[2], in_x); // 7 FLOPS
    out_y.w = dot(in_A[3], in_x); // 7 FLOPS
}

void A_times_x_SSE( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Transpose the matrix and re-order the vector.
    _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

    __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0));
    __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1));
    __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2));
    __m128 u4 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, u1); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, u2); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, u3); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, u4); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_add_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_add_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_add_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE3( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, x); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, x); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, x); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, x); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_hadd_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_hadd_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_hadd_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE4( float4& out_y, const float4* in_A, const float4& in_x ) // 28 FLOPS
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_dp_ps(A0, x, 0xFF); // 4 FLOPS
    __m128 m1 = _mm_dp_ps(A1, x, 0xFF); // 4 FLOPS
    __m128 m2 = _mm_dp_ps(A2, x, 0xFF); // 4 FLOPS
    __m128 m3 = _mm_dp_ps(A3, x, 0xFF); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 mov_01 = _mm_movelh_ps(m0, m1); // 4 FLOPS
    __m128 mov_23 = _mm_movelh_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_shuffle_ps(mov_01, mov_23, _MM_SHUFFLE(2, 0, 2, 0)); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_AVX( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m256 xx = _mm256_castps128_ps256(x);
           xx = _mm256_insertf128_ps(xx,x,1);
    __m256 A0 = _mm256_load_ps((const float*)(in_A + 0));
    __m256 A2 = _mm256_load_ps((const float*)(in_A + 2));

    // Multiply each matrix row with the vector x
    __m256 m0 = _mm256_mul_ps(A0, xx); // 4 FLOPS
    __m256 m2 = _mm256_mul_ps(A2, xx); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m256 sum_00 = _mm256_hadd_ps(m0, m2); // 4 FLOPS

  /*__m128 sum_10 = _mm256_extractf128_ps(sum_00,0);
    __m128 sum_01 = _mm256_extractf128_ps(sum_00,1);

    __m128 result = _mm_hadd_ps(sum_10, sum_01); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);*/

    // Finally, store the result (no temp variable: direct HADD, this avoid to copy from ALU128 to ALU256)
    _mm_store_ps((float*)&out_y, _mm_hadd_ps(_mm256_extractf128_ps(sum_00,0),
                                             _mm256_extractf128_ps(sum_00,1)));
}

void test_function ( Function f, string simd, unsigned int imax )
{
    float4 Y;
    float4 X1 = {0.5,1,0.2,0.7};
    float4 X2 = {0.7,1,0.2,0.5};
    float4 X3 = {0.5,0.2,1,0.7};
    float4 X4 = {1,0.7,0.2,0.5};
    float4 A[4] = {{0.5,1,0.2,0.7},
                   {0.6,0.4,0.1,0.8},
                   {0.3,0.8,0.2,0.5},
                   {1,0.4,0.6,0.9}};

    clock_t tstart = clock();

    for( unsigned int i=0 ; i<imax ; i++ )
    for( unsigned long int j=0 ; j<250000000 ; j++ )
    // Avoid for loop over long long, it is 2 times slower !
    {
        // Function pointer give a real call, whether the direct
        // call is inlined and thus results are overestimated.
        f( Y,A,X1 );
        f( Y,A,X2 );
        f( Y,A,X3 );
        f( Y,A,X4 );
    }

    clock_t tend = clock();

    double diff = static_cast<double>(tend - tstart) * 1e-3;

    cout << "Time  (" << simd << ") = " << diff << " s" << endl;
    cout << "Nops  (" << simd << ") = " << (double) imax << ".10^9" << endl;
    cout << "Power (" << simd << ") = " << (double) imax * 28. / diff << " GFLOPS" << endl; // 28 FLOPS for std.
    cout << endl;
}

int main ( int argc, char *argv[] )
{
    test_function ( &A_times_x     ,"std" , 1 );
    test_function ( &A_times_x_SSE ,"SSE" , 2 );
    test_function ( &A_times_x_SSE3,"SSE3", 3 );
    test_function ( &A_times_x_SSE4,"SSE4", 1 );
    test_function ( &A_times_x_AVX ,"AVX" , 3 );

    return 0;
}

我对此类问题的改进有一些麻烦。运行代码时，我得到以下结果（Intel Core i5 4670K、3.4GHz、Haswell、Codeblock+MinGW 编译器使用 -O2 -march=corei7-avx）：

Time  (std) = 6.287 s
Nops  (std) = 1.10^9
Power (std) = 4.45363 GFLOPS

Time  (SSE) = 6.661 s
Nops  (SSE) = 2.10^9
Power (SSE) = 8.40715 GFLOPS

Time  (SSE3) = 8.361 s
Nops  (SSE3) = 3.10^9
Power (SSE3) = 10.0466 GFLOPS

Time  (SSE4) = 6.131 s
Nops  (SSE4) = 1.10^9
Power (SSE4) = 4.56695 GFLOPS

Time  (AVX) = 8.767 s
Nops  (AVX) = 3.10^9
Power (AVX) = 9.58138 GFLOPS

我的问题如下：

这是否可以提高性能/加速？对于 SSE，它应该是 x4（最大值），对于 AVX，它应该是 x8。
为什么 AVX 不比 SSE3 快？

对于那些说：“停止使用你的东西，使用英特尔数学核心库”的人，我回答：“我不会，因为我想要一个小的可执行文件，而且我只需要在这种特定情况下使用 SIMD，而不是其他地方”； -)

score 5 · Accepted Answer

这是否可以提高性能/加速？对于 SSE，它应该是 x4（最大值），对于 AVX，它应该是 x8。

是的，我在efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-product详细解释了这一点。

M将 4x4 矩阵与列向量u相乘的有效方法v = M u是：

v = u1*col1 + u2*col2 + u3*col3 + u4*col4.

这要求您存储列向量。例如，假设您有以下 4x4 矩阵A：

 0  1  2  3
 4  5  6  7
 8  9 10 11
12 13 14 15

然后你把它存储为

0 4 8 c 1 5 9 d 2 6 a e 3 7 b f

相反，如果你想要行向量uT时间矩阵M，vT = uT*M那么你想要

vT = uT1*row1 + uT2*row2 + uT3*row3 + uT4*row4.

在这种情况下，您应该打包行而不是列。

因此，要优化函数中的代码，请A_times_x_SSE注释掉该行

 _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

并且此功能将比使用水平操作的其他功能更快。

SIMD 的水平操作效率不高。由于它们被分解为标量微操作，因此它们不是并行的，因此它们在某些不是 SIMD 中。它们仅在不方便以对 SIMD 友好的形式打包数据时才有用。例如，当您不能存储的列M并且只有行时。

为什么 AVX 不比 SSE3 快？

为了使用 AVX 有效地做到这一点，您必须一次对两个 4x4 矩阵进行操作，并打包矩阵，以便它们对 AVX 友好。现在让我们假设除了A上面定义的矩阵之外，您还有另一个矩阵B：

16 17 18 19
20 21 22 23
24 25 26 27
28 29 30 31

A打包和B用于 AVX的最佳方式是

col1A col1B col2A col2B col3A col3B col4A col4B
0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31

假设您有两个向量u = {0,1,2,3}，v = {4,5,6,7)并且您想要y=Au和z=Bv然后使用 AVX 执行以下操作：

c1 = col1A col1B = {0  4  8 12 16 20 24 28} = _mm256_load_ps
c2 = col2A col2B = {1  5  9 13 17 21 25 29}
c3 = col3A col3B = {2  6 10 14 18 22 26 30}
c4 = col4A col4B = {3  7 11 15 19 23 27 31}
broad1 = {0,0,0,0,4,4,4,4}
broad2 = {1,1,1,1,5,5,5,5}
broad3 = {2,2,2,2,6,6,6,6}
broad4 = {3,3,3,3,7,7,7,7}
w = broad1*c1 + broad2*c2 + broad3*c + broad4*c4;
//w = {y1, y2, y3, y4, z1, z2, z3, z4};

所以得到的 8-wide 向量w包含 4-vectory和z。这是 AVX 最有效的方法。如果你有固定的矩阵和变量向量，你可以在一个循环中运行，那么在运行时在循环之前打包对于一个大循环来说可以忽略不计。如果您知道矩阵在编译时是固定的，那么您可以在编译时打包。

c++ - (Vec4 x Mat4x4) 产品使用 SIMD 和改进

1 回答 1

Related

Reference