6

在 C++ 中,我有一个 short 到 float 的转换,它成为了我的代码的瓶颈。

该代码从一个硬件设备缓冲区进行转换,该缓冲区本身是 short 类型,表示来自一个高级光子计数器的输入。

// Hot loop: scale each 16-bit pixel value by 1/value into a float buffer.
float factor=  1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

一些细节

  1. 值应该从 0 到 2^16-1,它代表高灵敏度相机的像素值

  2. 我在一台带有 i7 处理器(i7 960,即 SSE 4.2 和 4.1)的多核 x86 机器上。

  3. 源与 8 位边界对齐(硬件设备的要求)

  4. W*H 总是能被 8 整除,大多数时候 W 和 H 都能被 8 整除

这让我很困扰,我能做些什么来改善它吗?

我正在使用 Visual Studios 2012...

4

7 回答 7

10

这是一个基本的 SSE4.1 实现:

//  SSE4.1: convert and scale 8 16-bit pixels per iteration.
//  Requires 16-byte aligned source/destination and size % 8 == 0.
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
    //  Load 8 16-bit ushorts.
    //  vi = {a,b,c,d,e,f,g,h}
    __m128i vi = _mm_load_si128((const __m128i*)(source + i));

    //  Convert to 32-bit integers
    //  vi0 = {a,0,b,0,c,0,d,0}
    //  vi1 = {e,0,f,0,g,0,h,0}
    __m128i vi0 = _mm_cvtepu16_epi32(vi);
    __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

    //  Convert to float
    __m128 vf0 = _mm_cvtepi32_ps(vi0);
    __m128 vf1 = _mm_cvtepi32_ps(vi1);

    //  Multiply
    vf0 = _mm_mul_ps(vf0,factor);
    vf1 = _mm_mul_ps(vf1,factor);

    //  Store
    _mm_store_ps(destination + i + 0,vf0);
    _mm_store_ps(destination + i + 4,vf1);
}

这假设:

  1. source并且destination都对齐到 16 个字节。
  2. W*H是 8 的倍数。

通过进一步展开这个循环可以做得更好。(见下文)


这里的思路如下:

  1. 将 8 个 short 加载到单个 SSE 寄存器中。
  2. 将该寄存器分成两部分:一个包含低 4 个 short,另一个包含高 4 个 short。
  3. 将两个寄存器零扩展为 32 位整数。
  4. 将它们都转换为floats。
  5. 乘以系数。
  6. 将它们存储到destination.

编辑 :

我已经有一段时间没有进行这种类型的优化了,所以我继续展开循环。

Core i7 920 @ 3.5 GHz
Visual Studio 2012 - 发布 x64:

Original Loop      : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416

进一步展开导致收益递减。

这是测试代码:

#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;


void default_loop(float *destination,const short* source,float value,int size){
    // Scalar reference implementation: destination[i] = source[i] / value,
    // with the division hoisted out of the loop as a multiplication.
    const float scale = 1.0f / value;
    int idx = 0;
    while (idx < size){
        const int sample = source[idx];
        destination[idx] = sample * scale;
        ++idx;
    }
}
//  SSE4.1 conversion: 8 pixels per iteration.
//  Requires 16-byte aligned source/destination and size % 8 == 0
//  (no scalar tail loop -- caller must guarantee this).
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Convert to 32-bit integers
        //  (the unpackhi moves the top 4 shorts into the low half first)
        __m128i vi0 = _mm_cvtepu16_epi32(vi);
        __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }
}
//  Same as vectorize8_unroll1, manually unrolled 2x (16 pixels/iteration).
//  NOTE(review): requires size % 16 == 0; with size % 16 == 8 the last
//  iteration reads/writes 8 elements past the end -- see the second answer.
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
}
void print_sum(const float *destination,int size){
    // Print the sum of the buffer: a cheap checksum that also prevents
    // the optimizer from discarding the benchmarked work.
    float total = 0;
    for (int idx = 0; idx < size; ++idx)
        total += destination[idx];
    cout << total << endl;
}

int main(){

    //  Number of pixels converted per call.
    int size = 8000;

    //  16-byte aligned buffers, as required by _mm_load_si128/_mm_store_ps.
    short *source       = (short*)_mm_malloc(size * sizeof(short), 16);
    float *destination  = (float*)_mm_malloc(size * sizeof(float), 16);

    //  Deterministic test pattern.
    for (int i = 0; i < size; i++){
        source[i] = i;
    }

    float value = 1.1f;    // 'f' suffix: avoids the double->float narrowing warning

    int iterations = 1000000;
    clock_t start;

    //  Default Loop
    start = clock();
    for (int it = 0; it < iterations; it++){
        default_loop(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    //  Vectorize 8, no unroll
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll1(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    //  Vectorize 8, unroll 2
    start = clock();
    for (int it = 0; it < iterations; it++){
        vectorize8_unroll2(destination,source,value,size);
    }
    cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
    print_sum(destination,size);

    _mm_free(source);
    _mm_free(destination);

    system("pause");   // Windows-only convenience; remove on other platforms
    return 0;
}
于 2013-04-16T11:12:17.303 回答
9

我相信我有最好的答案。我的结果比 Mystical 的要快得多。它们只需要 SSE2,但可以利用 SSE3、SSE4、AVX 甚至 AVX2(如果可用)。您不必更改任何代码。你只需要重新编译。

我跑了三种尺寸:8008、64000 和 2560*1920 = 4915200。我尝试了几种不同的变体,下面列出了最重要的。函数 vectorize8_unroll2 是 Mystical 的函数;我做了一个它的改进版,叫 vectorize8_unroll2_parallel。我自己的函数是 vec16_loop_unroll2_fix 和 vec16_loop_unroll2_parallel_fix,我认为它们比 Mystical 的更好。如果您使用 AVX 编译,这些函数会自动使用 AVX,但在 SSE4 甚至 SSE2 上也能正常工作。

此外,您写道“W*H 总是能被 8 整除,大多数情况下 W 和 H 都能被 8 整除”。所以我们不能假设 W*H 在所有情况下都能被 16 整除。 vectorize8_unroll2 当 size 不是 16 的倍数时,Mystical 的函数有一个错误(在他的代码中尝试 size=8008,你会明白我的意思)。我的代码没有这样的错误。

我正在使用 Agner Fog 的矢量类(Vector Class Library)进行矢量化。它不是 lib 或 dll 文件,只是一些头文件。我使用 OpenMP 进行并行化。以下是一些结果:

Intel Xeon E5630 @2.53GHz (supports upto SSE4.2)    
size 8008, size2 8032, iterations 1000000

                        default_loop time: 7.935 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000

size 64000, size2 64000, iterations 100000
                        default_loop time: 6.387 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 5.125 seconds, diff 0.000000
                  vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
     vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000

编辑:我在这个答案的末尾使用 GCC 在带有 AVX 的系统上添加了结果。

下面是代码。代码看起来很长,因为我做了很多交叉检查并测试了很多变体。在http://www.agner.org/optimize/#vectorclass下载矢量类 。将头文件(vectorclass.h、instrset.h、vectorf128.h、vectorf256.h、vectorf256e.h、vectori128.h、vectori256.h、vectori256e.h)复制到您编译的目录中。在 C++/命令行下添加 /D__SSE4_2__。在发布模式下编译。如果你有一个带 AVX 的 CPU,那么请改用 /arch:AVX。在 C++ 属性/语言下添加 OpenMP 支持。

In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp

在下面的代码中,该函数vec16_loop_unroll2_parallel要求数组是 32 的倍数。您可以将数组大小更改为 32 的倍数(这就是 size2 所指的),或者如果这不可能,您可以使用vec16_loop_unroll2_parallel_fix没有此类限制的函数. 无论如何,它一样快。

#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"

#define ROUND_DOWN(x, s) ((x) & ~((s)-1))

inline void* aligned_malloc(size_t size, size_t align) {
    // Portable aligned allocation: MSVC CRT on Windows, POSIX elsewhere.
    // Returns 0 on failure, like malloc.
    #ifdef _MSC_VER 
    return _aligned_malloc(size, align);
    #else 
    void *block;
    if (posix_memalign(&block, align, size) != 0) block = 0;
    return block;
    #endif
}

inline void aligned_free(void *ptr) {
    // Release a block obtained from aligned_malloc (safe on null).
    #ifdef _MSC_VER 
    _aligned_free(ptr);
    #else 
    free(ptr);
    #endif
}

void default_loop(float *destination, const unsigned short* source, float value, int size){
    // Scalar baseline: destination[i] = source[i] / value, with the
    // reciprocal precomputed once.
    const float scale = 1.0f/value;
    int idx = 0;
    while (idx < size) {
        const int sample = source[idx];
        destination[idx] = sample * scale;
        ++idx;
    }
}


// Scalar baseline split across cores with OpenMP; iterations are
// independent, so a plain `parallel for` is sufficient.
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
    float factor = 1.0f / value;
    #pragma omp parallel for  
    for (int i = 0; i < size; i++) {
        int value = source[i];
        destination[i] = value*factor;
    }
}

// 8 pixels/iteration using Agner Fog's vector class (128-bit vectors).
// Requires size % 8 == 0 and suitably aligned buffers.
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 8) {
    Vec8us vi = Vec8us().load(source + i);
    // Zero-extend low/high halves to 32-bit, then convert to float.
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

// vec8_loop manually unrolled 2x (16 pixels/iteration).
// Requires size % 16 == 0.
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

// vec8_loop with the outer loop distributed across threads via OpenMP.
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 8) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 4);
  }
}

// vec8_loop_unroll2 + OpenMP. Requires size % 16 == 0.
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    Vec8us vi = Vec8us().load(source + i);
    Vec4ui vi0 = extend_low(vi);
    Vec4ui vi1 = extend_high(vi);
    Vec4f vf0 = to_float(vi0);
    Vec4f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 4);

    Vec8us vi_new = Vec8us().load(source + i + 8);
    Vec4ui vi2 = extend_low(vi_new);
    Vec4ui vi3 = extend_high(vi_new);
    Vec4f vf2 = to_float(vi2);
    Vec4f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 8);
    vf3.store(destination + i + 12);
  }
}

// 16 pixels/iteration using 256-bit vector-class types; compiles to AVX
// where available, otherwise the library emulates with two 128-bit ops.
// Requires size % 16 == 0.
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

// vec16_loop manually unrolled 2x (32 pixels/iteration).
// Requires size % 32 == 0 -- see vec16_loop_unroll2_fix for the
// variant with a scalar tail loop.
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  for (int i = 0; i < size; i += 32) {
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

  }
}

// vec16_loop_unroll2 for arbitrary size: the vector loop covers the
// largest multiple of 32, a scalar tail loop finishes the remainder.
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    int i = 0;
    for (; i <ROUND_DOWN(size, 32); i += 32) {
    Vec16us vi = Vec16us().load(source + i);

    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i + 0);
    vf1.store(destination + i + 8);

    Vec16us vi_new = Vec16us().load(source + i + 16);

    Vec8ui vi2 = extend_low(vi_new);
    Vec8ui vi3 = extend_high(vi_new);
    Vec8f vf2 = to_float(vi2);
    Vec8f vf3 = to_float(vi3);
    vf2*=factor;
    vf3*=factor;
    vf2.store(destination + i + 16);
    vf3.store(destination + i + 24);

    }
    // Scalar tail: remaining size % 32 elements.
    for (; i < size; i++) {
        int value = source[i];
        destination[i] = value*factor;
    }

}

// vec16_loop + OpenMP. Requires size % 16 == 0.
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
  float factor=  1.0f/value;
  #pragma omp parallel for
  for (int i = 0; i < size; i += 16) {
    Vec16us vi = Vec16us().load(source + i);
    Vec8ui vi0 = extend_low(vi);
    Vec8ui vi1 = extend_high(vi);
    Vec8f vf0 = to_float(vi0);
    Vec8f vf1 = to_float(vi1);
    vf0*=factor;
    vf1*=factor;
    vf0.store(destination + i);
    vf1.store(destination + i + 8);
  }
}

// vec16_loop_unroll2 + OpenMP. Requires size % 32 == 0 -- use
// vec16_loop_unroll2_parallel_fix for arbitrary sizes.
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
    float factor=  1.0f/value;
    #pragma omp parallel for
    for (int i = 0; i < size; i += 32) {
        Vec16us vi = Vec16us().load(source + i); 
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        Vec16us vi_new = Vec16us().load(source + i + 16);
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);
    }
}

void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
    // Multi-threaded, 2x-unrolled conversion for arbitrary `size`:
    // the OpenMP loop covers the largest multiple of 32, a scalar
    // tail loop finishes the remainder.
    // (Removed the unused outer `int i = 0;` that the two loop-local
    // `i` declarations shadowed.)
    float factor=  1.0f/value;
    #pragma omp parallel for 
    for (int i=0; i <ROUND_DOWN(size, 32); i += 32) {
        Vec16us vi = Vec16us().load(source + i);  
        Vec8ui vi0 = extend_low(vi);
        Vec8ui vi1 = extend_high(vi);
        Vec8f vf0 = to_float(vi0);
        Vec8f vf1 = to_float(vi1);
        vf0*=factor;
        vf1*=factor;
        vf0.store(destination + i + 0);
        vf1.store(destination + i + 8);

        Vec16us vi_new = Vec16us().load(source + i + 16); 
        Vec8ui vi2 = extend_low(vi_new);
        Vec8ui vi3 = extend_high(vi_new);
        Vec8f vf2 = to_float(vi2);
        Vec8f vf3 = to_float(vi3);
        vf2*=factor;
        vf3*=factor;
        vf2.store(destination + i + 16);
        vf3.store(destination + i + 24);

    }

    // Scalar tail: remaining size % 32 elements.
    for(int i = ROUND_DOWN(size, 32); i < size; i++) {
        int value = source[i];
        destination[i] = value*factor;
    }

}

//  Mystical's SSE4.1 loop (8 pixels/iteration), included for comparison.
//  Requires 16-byte alignment and size % 8 == 0.
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Convert to 32-bit integers
        __m128i vi0 = _mm_cvtepu16_epi32(vi);
        __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }
}

//  Mystical's 2x-unrolled SSE4.1 loop (16 pixels/iteration).
//  Requires size % 16 == 0 -- overruns the buffers otherwise.
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    for (int i = 0; i < size; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
}

//  vectorize8_unroll1 + OpenMP across the outer loop.
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int i = 0; i < size; i += 8)
    {
        //  Load 8 16-bit ushorts.
        __m128i vi = _mm_load_si128((const __m128i*)(source + i));

        //  Convert to 32-bit integers
        __m128i vi0 = _mm_cvtepu16_epi32(vi);
        __m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

        //  Convert to float
        __m128 vf0 = _mm_cvtepi32_ps(vi0);
        __m128 vf1 = _mm_cvtepi32_ps(vi1);

        //  Multiply
        vf0 = _mm_mul_ps(vf0,factor);
        vf1 = _mm_mul_ps(vf1,factor);

        //  Store
        _mm_store_ps(destination + i + 0,vf0);
        _mm_store_ps(destination + i + 4,vf1);
    }
}



//  vectorize8_unroll2 + OpenMP. Requires size % 16 == 0.
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
    __m128 factor = _mm_set1_ps(1.0f / value);
    #pragma omp parallel for
    for (int i = 0; i < size; i += 16)
    {
        __m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
        __m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

        //  Split into two registers
        __m128i b0 = _mm_unpackhi_epi64(a0,a0);
        __m128i b1 = _mm_unpackhi_epi64(a1,a1);

        //  Convert to 32-bit integers
        a0 = _mm_cvtepu16_epi32(a0);
        b0 = _mm_cvtepu16_epi32(b0);
        a1 = _mm_cvtepu16_epi32(a1);
        b1 = _mm_cvtepu16_epi32(b1);

        //  Convert to float
        __m128 c0 = _mm_cvtepi32_ps(a0);
        __m128 d0 = _mm_cvtepi32_ps(b0);
        __m128 c1 = _mm_cvtepi32_ps(a1);
        __m128 d1 = _mm_cvtepi32_ps(b1);

        //  Multiply
        c0 = _mm_mul_ps(c0,factor);
        d0 = _mm_mul_ps(d0,factor);
        c1 = _mm_mul_ps(c1,factor);
        d1 = _mm_mul_ps(d1,factor);

        //  Store
        _mm_store_ps(destination + i +  0,c0);
        _mm_store_ps(destination + i +  4,d0);
        _mm_store_ps(destination + i +  8,c1);
        _mm_store_ps(destination + i + 12,d1);
    }
}

// Copy the first `size` elements of a into b.
// (Removed an unused local accumulator `sum` from the original.)
void copy_arrays(float* a, float*b, const int size) {
    for(int i=0; i<size; i++) {
        b[i] = a[i];
    }
}

// Element-wise comparison: reports the first mismatch (if any) and
// returns the accumulated difference over the matching prefix.
float compare_arrays(float* a, float*b, const int size) {
    float total = 0;
    int i = 0;
    while (i < size) {
        const float delta = a[i] - b[i];
        if (delta != 0) {
            printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], delta);
            break;
        }
        total += delta;
        ++i;
    }
    return total;
}

// Fill `a` with pseudo-random 16-bit pixel values in [0, 65535].
// Fix: the original computed (int)(65536*r); when rand() == RAND_MAX
// that yields 65536, which wraps to 0 in an unsigned short.
void randomize_array(unsigned short* a, const int size) {
    for(int i=0; i<size; i++) {
        float r = (float)rand()/RAND_MAX;
        int v = (int)(65536*r);
        if (v > 65535) v = 65535;   // clamp the r == 1.0 edge case
        a[i] = (unsigned short)v;
    }
}

void run(int size, int iterations) {
    int rd = ROUND_DOWN(size, 32);
    int size2 = rd == size ? size : rd + 32;
    float value = 1.1f;

    printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
    unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
    float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
    float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);

    void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
    fp[0] = default_loop;
    fp[1] = vec8_loop;
    fp[2] = vec8_loop_unroll2;
    fp[3] = vec16_loop;
    fp[4] = vec16_loop_unroll2;
    fp[5] = vec16_loop_unroll2_fix;
    fp[6] = vectorize8_unroll1;
    fp[7] = vectorize8_unroll2;

    fp[8] = default_loop_parallel;
    fp[9] = vec8_loop_parallel;
    fp[10] = vec8_loop_unroll2_parallel;
    fp[11] = vec16_loop_parallel;
    fp[12] = vec16_loop_unroll2_parallel;
    fp[13] = vec16_loop_unroll2_parallel_fix;
    fp[14] = vectorize8_unroll1_parallel;
    fp[15] = vectorize8_unroll2_parallel;

    char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
        "default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
        "vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};

    randomize_array(source, size2);

    copy_arrays(destination_old, destination_ref, size);
    fp[0](destination_ref, source, value, size);

    for(int i=0; i<16; i++) {
        copy_arrays(destination_old, destination, size);
        double dtime = omp_get_wtime();
        for (int it = 0; it < iterations; it++){
            fp[i](destination, source, value, size);
        }
        dtime = omp_get_wtime() - dtime;
        float diff = compare_arrays(destination, destination_ref, size);
        printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
    }
    printf("\n");
    aligned_free(source);
    aligned_free(destination);
    aligned_free(destination_old);
    aligned_free(destination_ref);
}
int main() {
    //  Benchmark three problem sizes; iteration counts scaled so each
    //  run takes a few seconds.
    run(8008, 1000000); 
    run(64000, 100000);
    run(2560*1920, 1000);
}

结果 在带有 AVX 的系统上使用 GCC。GCC 会自动向量化该循环(Visual Studio 对 short 类型做不到,但换成 int 就可以)。使用手写的矢量化代码您获得的收益很少。但是,使用多个线程可能会有所帮助,具体取决于数组大小。对于较小的数组大小 8008,OpenMP 给出了更差的结果。但是,对于较大的数组大小 128000,使用 OpenMP 可以提供更好的结果。对于最大的数组大小 4915200,它完全受内存带宽限制,OpenMP 无济于事。

i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
                        default_loop time: 1.319 seconds, diff 0.000000          
              vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
                  vectorize8_unroll2 time: 1.227 seconds, diff 0.000000                
         vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000

size 128000, size2 128000, iterations 100000
                        default_loop time: 2.902 seconds, diff 0.000000                     
              vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
                  vectorize8_unroll2 time: 2.844 seconds, diff 0.000000         
     vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
         vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000

size 4915200, size2 4915200, iterations 1000
                        default_loop time: 2.313 seconds, diff 0.000000
              vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000    
                  vectorize8_unroll2 time: 2.318 seconds, diff 0.000000                
     vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000         
         vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000
于 2013-05-03T15:21:13.823 回答
5

在我的机器上使用 SSE 内在函数 [四核 Athlon,3.3GHz,16GB 的 RAM],g++ -O2优化 [1] 提供了大约 2.5-3 倍的速度。我还在内联汇编器中编写了一个函数来做同样的事情,但它并没有明显更快(同样,这适用于我的机器,请随意在其他机器上运行)。

我尝试了各种尺寸的 H * W,它们都给出了大致相同的结果。

[1] 使用 g++ -O3 时所有四个函数的耗时相同,因为显然 -O3 启用了自动矢量化。因此,假设您的编译器支持类似的自动矢量化功能,那么整个练习有点浪费时间。

结果

convert_naive                  sum=4373.98 t=7034751 t/n=7.03475
convert_naive                  sum=4373.98 t=7266738 t/n=7.26674
convert_naive                  sum=4373.98 t=7006154 t/n=7.00615
convert_naive                  sum=4373.98 t=6815329 t/n=6.81533
convert_naive                  sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4                sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4                sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4                sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4                sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4                sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic          sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic          sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic          sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic          sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic          sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm          sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm          sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm          sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm          sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm          sum=4373.98 t=3176522 t/n=3.17652

代码

#include <iostream>
#include <iomanip>
#include <cstdlib> 
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>


#define W 1000
#define H 1000

// Read the CPU time-stamp counter (x86 `rdtsc`); the 32-bit halves
// arrive in EDX:EAX and are recombined into a 64-bit value.
// NOTE(review): raw TSC is not serialized, so counts across an
// out-of-order core are approximate -- fine for coarse benchmarking.
static __inline__ unsigned long long rdtsc(void)
{
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

void convert_naive(short *source, float *destination)
{
    // Baseline scalar conversion: destination[i] = source[i] / 32767.
    const float scale = 1.0f/32767;
    for (int idx = 0; idx < W*H; ++idx)
    {
        const int sample = source[idx];
        destination[idx] = sample*scale;
    }
}


void convert_unroll4(short *source, float *destination)
{
    // Manually 4-way unrolled variant of the naive conversion.
    // Assumes W*H is a multiple of 4.
    const float scale = 1.0f/32767;
    for (int i = 0; i < W*H; i += 4)
    {
        const int s0 = source[i];
        const int s1 = source[i+1];
        const int s2 = source[i+2];
        const int s3 = source[i+3];
        destination[i]   = s0*scale;
        destination[i+1] = s1*scale;
        destination[i+2] = s2*scale;
        destination[i+3] = s3*scale;
    }
}


// SSE2 conversion, 4 signed shorts per iteration.
// NOTE(review): this uses __m64 (MMX) operands via _mm_set_epi64 without a
// closing _mm_empty(); mixing MMX and x87 state this way is risky -- verify
// on the target toolchain.
void convert_sse_intrinsic(short *source, float *destination)
{
    __m128 factor =  { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
    __m64 zero1 =  { 0,0 };
    __m128i zero2 =  { 0,0 };
    __m64 *ps = reinterpret_cast<__m64 *>(source);
    __m128 *pd = reinterpret_cast<__m128 *>(destination);
    for (int i = 0; i < W*H; i+=4)
    {
    // Widen 4 shorts to 32 bits; the shift-left/shift-right pair then
    // sign-extends them (the source here is treated as signed).
    __m128i value = _mm_unpacklo_epi16(_mm_set_epi64(zero1, *ps), zero2);
    value = _mm_srai_epi32(_mm_slli_epi32(value, 16), 16);
    __m128  fval  = _mm_cvtepi32_ps(value);
    *pd = _mm_mul_ps(fval, factor);   // destination[0,1,2,3] = value[0,1,2,3] * factor;
    pd++;
    ps++;
    }
}

void convert_sse_inlineasm(short *source, float *destination)
{
    //  Hand-written SSE2 conversion, 16 shorts per iteration.
    //  Fixes over the original:
    //   * `jb` instead of `jbe`: the old condition ran one extra iteration,
    //     reading 32 bytes past the end of `source` (the +16 padding on
    //     `outbuffer` only hid the out-of-bounds *stores*).
    //   * complete clobber list: xmm2/xmm4/xmm5 are written by the asm but
    //     were not declared, and "memory" informs the compiler of the stores.
    __m128 factor =  { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
    __asm__ __volatile__(
    "\t pxor       %%xmm1, %%xmm1\n"
    "\t movaps     %3, %%xmm2\n"
    "\t mov        $0, %%rax\n"
    "1:"
    "\t movq       (%1, %%rax), %%xmm0\n"
    "\t movq       8(%1, %%rax), %%xmm3\n"
    "\t movq       16(%1, %%rax), %%xmm4\n"
    "\t movq       24(%1, %%rax), %%xmm5\n"
    "\t punpcklwd  %%xmm1, %%xmm0\n"
    "\t pslld      $16, %%xmm0\n"
    "\t psrad      $16, %%xmm0\n"
    "\t cvtdq2ps   %%xmm0, %%xmm0\n"
    "\t mulps      %%xmm2, %%xmm0\n"
    "\t punpcklwd  %%xmm1, %%xmm3\n"
    "\t pslld      $16, %%xmm3\n"
    "\t psrad      $16, %%xmm3\n"
    "\t cvtdq2ps   %%xmm3, %%xmm3\n"
    "\t mulps      %%xmm2, %%xmm3\n"
    "\t punpcklwd  %%xmm1, %%xmm4\n"
    "\t pslld      $16, %%xmm4\n"
    "\t psrad      $16, %%xmm4\n"
    "\t cvtdq2ps   %%xmm4, %%xmm4\n"
    "\t mulps      %%xmm2, %%xmm4\n"
    "\t punpcklwd  %%xmm1, %%xmm5\n"
    "\t pslld      $16, %%xmm5\n"
    "\t psrad      $16, %%xmm5\n"
    "\t cvtdq2ps   %%xmm5, %%xmm5\n"
    "\t mulps      %%xmm2, %%xmm5\n"
    "\t movaps     %%xmm0, (%0, %%rax, 2)\n"
    "\t movaps     %%xmm3, 16(%0, %%rax, 2)\n"
    "\t movaps     %%xmm4, 32(%0, %%rax, 2)\n"
    "\t movaps     %%xmm5, 48(%0, %%rax, 2)\n"
    "\t addq       $32, %%rax\n"
    "\t cmpq       %2, %%rax\n"
    "\t jb         1b\n"
    : /* no outputs */ 
    : "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
      "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
}




// Static, 16-byte aligned benchmark buffers (alignment required by the
// movaps stores in the asm kernel). outbuffer carries 16 extra floats of
// padding to absorb the asm kernel's trailing stores.
short inbuffer[W * H] __attribute__ ((aligned (16)));
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
float outbuffer2[W * H];
#endif


// Signature shared by all conversion kernels.
typedef void (*func)(short *source, float *destination);

// A named benchmark entry: display name + kernel to run.
struct BmEntry
{
    const char *name;
    func  fn;
};

// Time one kernel with rdtsc, checksum its output (prevents dead-code
// elimination), optionally diff against the naive kernel, and print
// total and per-element cycle counts.
void bm(BmEntry& e)
{
    memset(outbuffer, 0, sizeof(outbuffer));
    unsigned long long t = rdtsc();
    e.fn(inbuffer, outbuffer);
    t = rdtsc() - t; 

    // Checksum so the compiler cannot discard the converted data.
    float sum = 0;
    for(int i = 0; i < W * H; i++)
    {
    sum += outbuffer[i]; 
    }

#if DEBUG
    // Cross-check the kernel's output against the scalar reference.
    convert_naive(inbuffer, outbuffer2);
    for(int i = 0; i < W * H; i++)
    {
    if (outbuffer[i] != outbuffer2[i])
    {
        std::cout << i << ":: " << inbuffer[i] << ": " 
              << outbuffer[i] << " != " << outbuffer2[i] 
              << std::endl;
    }
    }
#endif

    std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t << 
    " t/n=" << (double)t / (W * H) << std::endl;
}


// BM(f) expands to { "f", f }: stringize the kernel name for the report.
#define BM(x) { #x, x }


// All kernels under test, in execution order.
BmEntry table[] = 
{
    BM(convert_naive),
    BM(convert_unroll4),
    BM(convert_sse_intrinsic),
    BM(convert_sse_inlineasm),
};


int main()
{
    for(int i = 0; i < W * H; i++)
    {
    inbuffer[i] = (short)i;
    }

    for(int i = 0; i < sizeof(table)/sizeof(table[i]); i++)
    {
    for(int j = 0; j < 5; j++)
        bm(table[i]);
    }
    return 0;
}
于 2013-04-16T12:45:37.220 回答
2

您还可以使用 OpenMP 来利用 CPU 的每个内核,而且很简单,只需执行以下操作:

// Same scalar loop, spread across all cores with OpenMP.
#include <omp.h>
float factor=  1.0f/value;
#pragma omp parallel for 
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
    int value = source[i];//ushort -> int
    destination[i] = value*factor;//int*float->float
}

这是基于之前程序的结果,只需添加如下内容:

#pragma omp parallel for 
for (int it = 0; it < iterations; it++){
 ...
}

然后这是结果

beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp

beta@beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007


beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3


beta@beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007

. .

结果显示使用 openmp 有 100% 的改进。Visual C++ 也支持 openmp。

于 2013-05-03T03:33:07.497 回答
2

这不算是一个正式的答案,不要太当真,但我实际上想知道使用 256KB 查找表的代码会表现如何。(基本上是一个包含 65536 个条目的 short -> float 查找表)。

我相信 CoreI7 有大约 8 兆字节的缓存,所以查找表适合数据缓存。

我真的很想知道这会如何影响性能:)

于 2013-04-16T10:38:55.503 回答
2

不确定循环中的条件表达式是否只计算一次。你可以试试:

// Hoist the W*H bound into a local in case the compiler re-evaluates
// the loop condition each iteration.
float factor=  1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
    int value = source[i];//short -> int
    destination[i] = value*factor;//int->float
}
于 2013-04-16T07:36:28.077 回答
1

您可以尝试近似表达式

float factor = 1.0f/value;

用一个分数 numerator/denominator 来近似,其中 numerator 和 denominator 都是 int。这可以按照您在应用程序中所需的精度来完成,例如

int denominator = 10000;
int numerator = factor * denominator;

然后你可以用整数算术来计算,比如

int value = source[i];
// Fixed: the original divided by `numerator`, which cancels the
// multiplication and just yields `value`; the scale is numerator/denominator.
destination[i] = (value * numerator) / denominator;

您必须处理溢出问题,也许您需要切换到long(甚至long long在 64 位系统上)进行计算。

于 2013-04-16T07:46:12.800 回答