c++ - 值得使用 SSE 还是应该只依赖编译器？

Question

我正在研究很棒的 SSE 指令，并开始使用一些简单的代码来测量使用它们的函数与使用“标准”代码（即非 SSE）的相同函数之间的差异。我意识到，当我编译代码（使用 -O3 标志）时，使用 SSE 版本的函数实际上（非常轻微）“慢”于不使用 SSE 指令的程序版本。我的猜测是：

编译器在优化代码方面做得很好
SSE 函数可以运行得更快，但是将浮点数加载到寄存器是有代价的，这抵消了使用 SSE 指令的好处。
testSSE() 函数不够复杂，无法真正显示使用 SSE 的程序版本与不使用 SSE 的程序版本之间的差异。

谁能告诉我他/她对此有何看法？非常感谢 -

编辑：所以我更正了代码（见下面的 2 个代码清单）。即使使用更短的更正版本，SSE 版本给我 2''48 而非 SSE 版本给我 1''36，证实了这样一个事实，在这种情况下编译器比我做得更好！

编辑：带有错误的旧代码（请参阅下面的更正版本）

// compiled with c++ tmp.cpp -msse4 -o testSSE -O3

#include <iostream>
#include <cmath>

#include <stdio.h>
#include <pmmintrin.h>

/**
 * Sums the eight floats of each of four nodes using SSE3 horizontal adds.
 *
 * @param node1  pointer to 8 readable floats
 * @param node2  pointer to 8 readable floats
 * @param node3  pointer to 8 readable floats
 * @param node4  pointer to 8 readable floats
 * @param result receives 4 floats; result[k] is the sum of the (k+1)-th node
 *
 * _mm_hadd_ps(a, b) yields {a0+a1, a2+a3, b0+b1, b2+b3}, so three rounds of
 * it reduce 8 floats per node to one, using the same pairwise association
 * as the scalar reference.
 *
 * Unaligned load/store intrinsics are used on purpose: the pointers handed
 * in by callers (new[] storage, a plain stack array) are not explicitly
 * 16-byte aligned, which the aligned variants would require.
 */
inline void testSSE(float *node1, float *node2, float *node3, float *node4, float *result)
{
    // Round 1: adjacent pairs inside each node -> 4 partial sums per node.
    const __m128 pairs1 = _mm_hadd_ps(_mm_loadu_ps(node1), _mm_loadu_ps(node1 + 4));
    const __m128 pairs2 = _mm_hadd_ps(_mm_loadu_ps(node2), _mm_loadu_ps(node2 + 4));
    const __m128 pairs3 = _mm_hadd_ps(_mm_loadu_ps(node3), _mm_loadu_ps(node3 + 4));
    const __m128 pairs4 = _mm_hadd_ps(_mm_loadu_ps(node4), _mm_loadu_ps(node4 + 4));

    // Round 2: 2 partial sums per node, two nodes packed per register.
    const __m128 halves12 = _mm_hadd_ps(pairs1, pairs2);
    const __m128 halves34 = _mm_hadd_ps(pairs3, pairs4);

    // Round 3: one total per node, in node order.
    _mm_storeu_ps(result, _mm_hadd_ps(halves12, halves34));
}

/* Adds the eight consecutive floats at v as a balanced pairwise tree:
 * ((v0+v1)+(v2+v3)) + ((v4+v5)+(v6+v7)). */
static inline float pairwise_sum8(const float *v)
{
    const float p0 = v[0] + v[1];
    const float p1 = v[2] + v[3];
    const float p2 = v[4] + v[5];
    const float p3 = v[6] + v[7];

    const float lower = p0 + p1;
    const float upper = p2 + p3;

    return lower + upper;
}

/**
 * Scalar reference: result[k] receives the sum of the eight floats of the
 * (k+1)-th node, added pairwise.
 *
 * All four sums are computed before anything is written to result.
 */
void test(float *node1, float *node2, float *node3, float *node4, float *result)
{
    const float total1 = pairwise_sum8(node1);
    const float total2 = pairwise_sum8(node2);
    const float total3 = pairwise_sum8(node3);
    const float total4 = pairwise_sum8(node4);

    result[0] = total1;
    result[1] = total2;
    result[2] = total3;
    result[3] = total4;
}

// Benchmark driver for testSSE(): runs 10,000,000 iterations and prints the
// elapsed clock() time in seconds to stderr.
int main(int argc, char **argv)
{
    int nnodes = 4;
    double t = clock();
    for (int k = 0; k < 10000000; ++k) {
        // The input array is allocated, filled and freed on every iteration,
        // inside the timed region, so the reported time includes that work
        // and not only the testSSE() call.
        float *data = new float [nnodes * 8];
        // Node n occupies data[8*n .. 8*n+7]; every element of node n is n+1.
        for (int i = 0; i < nnodes * 8; ++i) { data[i] = (i / 8) + 1; /* fprintf(stderr, "data %02d %f\n", i, data[i]); */ }
        float result[4];
        // off is never read.
        int off = sizeof(float) * 8;
        testSSE(data, data + 8, data + 16, data + 24, result);
        // NOTE(review): result is not read after the call; an optimizing
        // compiler may be able to drop work whose output is unused -- confirm
        // by inspecting the generated code.
        delete [] data;
    }
    fprintf(stderr, "%02f (sec)\n", (clock() - t) / (float)CLOCKS_PER_SEC);
    return 0;
}

编辑：新（更正）代码

#include <iostream>
#include <cmath>

#include <stdio.h>
#include <pmmintrin.h>

/**
 * SSE3 counterpart of test(): result[k] receives the sum of all eight
 * floats of the (k+1)-th node.
 *
 * @param node1  pointer to 8 readable floats
 * @param node2  pointer to 8 readable floats
 * @param node3  pointer to 8 readable floats
 * @param node4  pointer to 8 readable floats
 * @param result receives 4 floats, one total per node, in node order
 *
 * Both 4-float halves of every node are loaded; reading only the first half
 * would produce sums over 4 elements and disagree with test(), which adds 8.
 * The reduction uses _mm_hadd_ps(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3} three
 * times, which is the same pairwise association test() uses.
 *
 * Loads and the store are the unaligned variants because callers pass
 * pointers (new[] storage, a stack array) with no explicit 16-byte alignment.
 */
inline void testSSE(float *node1, float *node2, float *node3, float *node4, float *result)
{
    __m128 tmp0, tmp1, tmp2, tmp3;

    // Stage 1: sums of adjacent pairs, four per node.
    tmp0 = _mm_hadd_ps(_mm_loadu_ps(node1), _mm_loadu_ps(node1 + 4));
    tmp1 = _mm_hadd_ps(_mm_loadu_ps(node2), _mm_loadu_ps(node2 + 4));
    tmp2 = _mm_hadd_ps(_mm_loadu_ps(node3), _mm_loadu_ps(node3 + 4));
    tmp3 = _mm_hadd_ps(_mm_loadu_ps(node4), _mm_loadu_ps(node4 + 4));

    // Stage 2: two partial sums per node; nodes 1+2 and nodes 3+4 share a register.
    tmp0 = _mm_hadd_ps(tmp0, tmp1);
    tmp2 = _mm_hadd_ps(tmp2, tmp3);

    // Stage 3: one total per node.
    _mm_storeu_ps(result, _mm_hadd_ps(tmp0, tmp2));
}

/**
 * Scalar reference implementation: result[n] receives the sum of the eight
 * floats of the (n+1)-th node.
 *
 * The additions are done in three pairwise stages (8 -> 4 -> 2 -> 1 values
 * per node), and every input is read before result is written.
 */
void test(float *node1, float *node2, float *node3, float *node4, float *result)
{
    const float *const nodes[4] = { node1, node2, node3, node4 };
    float pairs[4][4];
    float halves[4][2];

    // Stage 1: adjacent elements of each node.
    for (int n = 0; n < 4; ++n) {
        for (int j = 0; j < 4; ++j) {
            pairs[n][j] = nodes[n][2 * j] + nodes[n][2 * j + 1];
        }
    }

    // Stage 2: adjacent stage-1 sums.
    for (int n = 0; n < 4; ++n) {
        halves[n][0] = pairs[n][0] + pairs[n][1];
        halves[n][1] = pairs[n][2] + pairs[n][3];
    }

    // Stage 3: one total per node.
    for (int n = 0; n < 4; ++n) {
        result[n] = halves[n][0] + halves[n][1];
    }
}

// Benchmark driver for the scalar test(): builds the input once, calls
// test() 1e9 times and prints the elapsed clock() time in seconds to stderr.
int main(int argc, char **argv)
{

    int nnodes = 4;
    // Allocated and filled once, before the timed region starts.
    float *data = new float [nnodes * 8];
    // Node n occupies data[8*n .. 8*n+7]; every element of node n is n+1.
    for (int i = 0; i < nnodes * 8; ++i) { data[i] = (i / 8) + 1; /* fprintf(stderr, "data %02d %f\n", i, data[i]); */ }
    double t = clock();
    // 1e+9 is a double literal, so k is converted to double for each comparison.
    for (int k = 0; k < 1e+9; ++k) {
        float result[4];
        // off is never read.
        int off = sizeof(float) * 8;
        test(data, data + 8, data + 16, data + 24, result);
        // NOTE(review): result is not read after the call and the input never
        // changes; an optimizing compiler may be able to drop or hoist this
        // work -- confirm by inspecting the generated code.
    }
    fprintf(stderr, "%02f (sec)\n", (clock() - t) / (float)CLOCKS_PER_SEC);
            delete [] data;
    return 0;
}

score 3 · Accepted Answer

I fixed your code to use SIMD efficiently. Your old method takes 14.1 seconds on my computer and the new method takes 1.2 seconds. I rewrote the code in your test function to make it simpler to read, but otherwise it's the same.

The old method stored the nodes in memory like this: node1[0], node1[1],...node1[7], node2[0], node2[1],.... The way you have now is called an Array of Structs (AoS). That's the slow way to use SSE and that's why it's not any better than your scalar code.

The new method, which uses SSE, stores the nodes like this: node1[0], node2[0], node3[0], node4[0], node1[1], node2[1], .... This is called a Struct of Arrays (SoA). That's the efficient way to use SIMD. In general, if you're using hadd often (or the dot product instruction), then you're probably not using the best algorithm with SIMD.

Here is the code including your old method and my new one. Note, there are several additional ways you could try to make this more efficient, such as unrolling the loop, but now at least the SIMD is being used correctly.

#include <iostream>
#include <cmath>

#include <stdio.h>
#include <pmmintrin.h>

/**
 * Scalar reference: result[k] receives the sum of the eight floats of the
 * (k+1)-th node, accumulated left to right (element 0 first).
 *
 * Nodes are processed in order and each total is stored as soon as it is
 * complete.
 */
void test(float *node1, float *node2, float *node3, float *node4, float *result)
{
    float *const rows[4] = { node1, node2, node3, node4 };

    for (int k = 0; k < 4; ++k) {
        const float *row = rows[k];
        float acc = row[0];
        for (int i = 1; i < 8; ++i) {
            acc = acc + row[i];
        }
        result[k] = acc;
    }
}

/**
 * Column-wise sum of 8 rows of 4 floats.
 *
 * @param nodes_soa 32 floats read as 8 consecutive groups of 4; must be
 *                  16-byte aligned because _mm_load_ps is used
 * @param result    receives 4 floats, result[c] = sum over the 8 groups of
 *                  element c; must be 16-byte aligned (_mm_store_ps)
 */
void testSSE(float *nodes_soa, float *result)
{
  const float *row = nodes_soa;
  const float *const stop = nodes_soa + 4 * 8;
  __m128 acc = _mm_set1_ps(0.0f);

  // One vertical add per group of four floats; no horizontal work needed.
  while (row != stop) {
    const __m128 lane_values = _mm_load_ps(row);
    acc = _mm_add_ps(lane_values, acc);
    row += 4;
  }

  _mm_store_ps(result, acc);
}
// Benchmark driver: times the scalar test() on the node-by-node layout, then
// testSSE() on the interleaved layout, 1e9 calls each, and prints each
// elapsed clock() time in seconds to stdout.
int main(int argc, char **argv)
{

    int nnodes = 4;
    // One buffer is reused for both layouts; it is refilled before each run.
    float *data = new float [nnodes * 8];
    double t;

    //old method using array of structs (AoS)
    // Node n occupies data[8*n .. 8*n+7]; every element of node n is n+1.
    for (int i = 0; i < nnodes * 8; ++i) { 
      data[i] = (i / 8) + 1; 
    //  printf("data %02d %f\n", i, data[i]); 
    }

    t = clock();
    // 1e+9 is a double literal, so k is converted to double for each comparison.
    for (int k = 0; k < 1e+9; ++k) {
        float result[4];
        // off is never read.
        int off = sizeof(float) * 8;
        test(data, data + 8, data + 16, data + 24, result);
    //printf("%f %f %f %f\n", result[0], result[1], result[2], result[3]);
    }
    printf("%02f (sec)\n", (clock() - t) / (float)CLOCKS_PER_SEC);

    //new method using struct of arrays (SoA)
    // Interleaved fill: data[i] = (i % 4) + 1, so every group of four
    // consecutive floats holds the values 1, 2, 3, 4.
    for (int i = 0; i < nnodes * 8; ++i) { 
      data[i] = i%4 + 1; 
      //printf("data %02d %f\n", i, data[i]); 
    }

    t = clock();
    for (int k = 0; k < 1e+9; ++k) {
        // NOTE(review): testSSE() uses _mm_load_ps/_mm_store_ps, which require
        // 16-byte aligned addresses; neither data (new[]) nor result requests
        // that alignment explicitly -- confirm for the target platform.
        float result[4];
        // off is never read.
        int off = sizeof(float) * 8;
        //test(data, data + 8, data + 16, data + 24, result);
        testSSE(data, result);
    //printf("%f %f %f %f\n", result[0], result[1], result[2], result[3]);
    }
    printf("%02f (sec)\n", (clock() - t) / (float)CLOCKS_PER_SEC);

    delete [] data;
    return 0;
}

Edit: In general you want to use 16-byte alignment with SSE. Here are the functions I normally use.

/**
 * Allocates `size` bytes whose address is a multiple of `align`.
 *
 * @param size  number of bytes to allocate
 * @param align requested alignment in bytes; must be a power of two
 * @return pointer to the block, or NULL if the allocation fails or the
 *         alignment is rejected by the platform allocator. Release the
 *         block with aligned_free().
 */
inline void* aligned_malloc(size_t size, size_t align) {
#ifdef _MSC_VER
    return _aligned_malloc(size, align);
#else
    // posix_memalign only accepts power-of-two multiples of sizeof(void *)
    // and fails with EINVAL for smaller values (e.g. 4 on a 64-bit target).
    // A stricter power-of-two alignment still satisfies the request, so
    // round small power-of-two alignments up instead of failing.
    const bool is_pow2 = align != 0 && (align & (align - 1)) == 0;
    if (is_pow2 && align < sizeof(void *)) {
        align = sizeof(void *);
    }

    void *result = NULL;
    if (posix_memalign(&result, align, size) != 0) {
        result = NULL;
    }
    return result;
#endif
}

/**
 * Releases a block obtained from aligned_malloc().
 *
 * @param ptr block to release; forwarded to _aligned_free() when built with
 *            MSVC and to free() otherwise
 */
inline void aligned_free(void *ptr) {
#if !defined(_MSC_VER)
    free(ptr);
#else
    _aligned_free(ptr);
#endif
}

Use

// Replace the plain new[] allocation with a 16-byte aligned one.
// Memory obtained this way must be released with aligned_free(), not delete[].
//float *data = new float [nnodes * 8];
float *data = (float*) aligned_malloc(nnodes*8*sizeof(float), 16);

score 3 · Accepted Answer

你的测试很糟糕。你做了很多与想要测试的内容无关的其他事情，却把它们全部计入了测量。

你在这里做的最慢的事情是每次调用 new 来分配一个新数组。这可能是这里唯一重要的事情。

如果要测试 SSE，请仅测量 SSE。

根据你的编译器和代码的写法，使用 -O3 时编译器可能会自行使用 SSE 来实现你的代码，甚至可能使用其他更适合这项工作、执行速度更快的指令集。

c++ - 值得使用 SSE 还是应该只依赖编译器？

2 回答 2

Related

Reference