0

I use VS2012 and want to test the relative efficiency of SSE and AVX. The code for SSE and AVX is almost the same, except that the SSE version uses `__m128` and the AVX version uses `__m256`. I expected the AVX code to be two times faster than the SSE code, but the test results show their speeds are almost the same.

I tried selecting `/arch:AVX`, `/arch:SSE`, or leaving the option unset, and I tried commenting out either the SSE code or the AVX code. Whatever I test, the time used for the SSE code is about 2138 ms and for the AVX code about 2106 ms. The outer `for` loop is only there to increase the total amount of work measured.

#include "testfun.h"
#include <iostream>
#include <time.h> 
#include <malloc.h>
#include "immintrin.h"
using namespace std;
#define dataLen  800000

// Benchmark: compare multiply-accumulate throughput of SSE (128-bit, 4 floats
// per op) against AVX (256-bit, 8 floats per op) over the same two buffers.
// Uses MSVC-specific facilities (_aligned_malloc/_aligned_free and the
// m128_f32 / m256_f32 union members) — not portable to GCC/Clang as written.
void testfun()
{
// 32-byte alignment satisfies both SSE (16-byte) and AVX (32-byte) aligned loads.
// NOTE(review): allocation results are not checked for NULL — confirm callers
// can tolerate dataLen this large, or add a check.
float *buf1 = reinterpret_cast<float*>(_aligned_malloc( sizeof(float)*dataLen, 32 ));
float *buf2 = reinterpret_cast<float*>(_aligned_malloc( sizeof(float)*dataLen, 32 ));
// Fill both inputs with 1.0f so the expected dot-product contribution per pass
// is simply dataLen.
for(int i=0; i<dataLen; i++)
{
    buf1[i] = 1;
    buf2[i] = 1;
}
double timePassed;
// clock() returns clock_t; storing into int narrows. On MSVC CLOCKS_PER_SEC
// is 1000, so the later (clock() - t) difference is printed directly as ms —
// presumably why no division by CLOCKS_PER_SEC is done. TODO confirm on
// other runtimes.
int t = clock();
float sum = 0;
//=========================SSE CODE=====================================
// Reinterpret the float buffers as arrays of packed 4-float vectors.
// Safe for the loads below because the buffers are 32-byte aligned.
__m128 *p1 = (__m128 *)buf1;
__m128 *p2 = (__m128 *)buf2;
__m128 _result = _mm_set_ps1(0.0f);

for(int j=0;j<10000; j++)
{   
    // Rewind both cursors to the start of the buffers for each outer pass.
    p1 = (__m128 *)buf1;
    p2 = (__m128 *)buf2;        
    // Fold the loop counter j into the accumulator so the compiler cannot
    // treat every outer iteration as identical and hoist/eliminate the work.
    _result = _mm_sub_ps(_mm_set_ps(j,0,0,0) ,_result);

    // dataLen/4 iterations: each step consumes one 4-float vector from
    // each buffer (two loads, one multiply, one add per iteration).
    for(int i=0; i<dataLen/4; i++)
    {
        _result = _mm_add_ps(_mm_mul_ps(*p1, *p2), _result);
        p1++;
        p2++;
    }
}

// Horizontal sum of the four lanes. m128_f32 is an MSVC-only union member
// of __m128; portable code would use shuffles or store to a float array.
sum = _result.m128_f32[0]+_result.m128_f32[1]+_result.m128_f32[2]+_result.m128_f32[3];
timePassed = clock() - t;
std::cout<<std::fixed<<"SSE calculate result : "<<sum<<std::endl;
std::cout<<"SSE time used: "<<timePassed<<"ms"<<std::endl;

//=========================AVX CODE=====================================
// Restart the timer; the AVX pass re-reads the same buffers.
t = clock();
__m256  *pp1 ; 
__m256  *pp2 ; 
__m256 _rresult = _mm256_setzero_ps();
sum = 0;

for(int j=0;j<10000; j++)
{   
    // Rewind; each step now consumes 8 floats per buffer instead of 4.
    pp1 = (__m256*) buf1;
    pp2 = (__m256*) buf2;
    // Same anti-optimization trick as the SSE loop: mix j into lane 7.
    _rresult = _mm256_sub_ps(_mm256_set_ps(j,0,0,0,0,0,0,0), _rresult);

    // dataLen/8 iterations — half as many as the SSE loop, but each moves
    // twice the data, so total bytes loaded per pass are identical.
    for(int i=0; i<dataLen/8; i++)
    {       
        _rresult = _mm256_add_ps(_mm256_mul_ps(*pp1, *pp2), _rresult);
        pp1++;
        pp2++;
    }
}

// Horizontal sum of the eight lanes (m256_f32 is MSVC-only, as above).
sum = _rresult.m256_f32[0]+_rresult.m256_f32[1]+_rresult.m256_f32[2]+_rresult.m256_f32[3]+_rresult.m256_f32[4]+_rresult.m256_f32[5]+_rresult.m256_f32[6]+_rresult.m256_f32[7];
timePassed = clock() - t;
std::cout<<std::fixed<<"AVX calculate result : "<<sum<<std::endl;
std::cout<<"AVX time used: "<<timePassed<<"ms"<<std::endl;

// Memory from _aligned_malloc must be released with _aligned_free, not free().
_aligned_free(buf1);
_aligned_free(buf2);

}

4

1 Answer

5

You are most likely just bandwidth-limited, since you only have two arithmetic instructions in your loop and you have two loads. If you reduce the size of your data set so that it fits in cache you should then see a difference in performance (since you'll have much greater load bandwidth and reduced latency for loads from cache).

(Also, your timing numbers seem very high - make sure that you are using the release build, i.e. that you have optimisation enabled, otherwise your results will be misleading.)

Answered 2013-08-30T10:22:49.423