我有以下代码(正常、SSE 和 AVX):
int testSSE(const aligned_vector & ghs, const aligned_vector & lhs) {
int result[4] __attribute__((aligned(16))) = {0};
__m128i vresult = _mm_set1_epi32(0);
__m128i v1, v2, vmax;
for (int k = 0; k < ghs.size(); k += 4) {
v1 = _mm_load_si128((__m128i *) & lhs[k]);
v2 = _mm_load_si128((__m128i *) & ghs[k]);
vmax = _mm_add_epi32(v1, v2);
vresult = _mm_max_epi32(vresult, vmax);
}
_mm_store_si128((__m128i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 4; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
int testAVX(const aligned_vector & ghs, const aligned_vector & lhs) {
int result[8] __attribute__((aligned(32))) = {0};
__m256i vresult = _mm256_set1_epi32(0);
__m256i v1, v2, vmax;
for (int k = 0; k < ghs.size(); k += 8) {
v1 = _mm256_load_si256((__m256i *) & ghs[ k]);
v2 = _mm256_load_si256((__m256i *) & lhs[k]);
vmax = _mm256_add_epi32(v1, v2);
vresult = _mm256_max_epi32(vresult, vmax);
}
_mm256_store_si256((__m256i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 8; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
int testNormal(const aligned_vector & ghs, const aligned_vector & lhs) {
int max = 0;
int tempMax;
for (int k = 0; k < ghs.size(); k++) {
tempMax = lhs[k] + ghs[k];
if (max < tempMax) {
max = tempMax;
}
}
return max;
}
所有这些功能都使用以下代码进行测试:
void alignTestSSE() {
aligned_vector lhs;
aligned_vector ghs;
int mySize = 4096;
int FinalResult;
int nofTestCases = 1000;
double time, time1, time2, time3;
vector<int> lhs2;
vector<int> ghs2;
lhs.resize(mySize);
ghs.resize(mySize);
lhs2.resize(mySize);
ghs2.resize(mySize);
srand(1);
for (int k = 0; k < mySize; k++) {
lhs[k] = randomNodeID(1000000);
lhs2[k] = lhs[k];
ghs[k] = randomNodeID(1000000);
ghs2[k] = ghs[k];
}
/* Warming UP */
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testNormal(lhs, ghs);
}
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testSSE(lhs, ghs);
}
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testAVX(lhs, ghs);
}
cout << "===========================" << endl;
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testSSE(lhs, ghs);
}
time = timestamp() - time;
time1 = time;
cout << "SSE took " << time << " s" << endl;
cout << "SSE Result: " << FinalResult << endl;
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testAVX(lhs, ghs);
}
time = timestamp() - time;
time3 = time;
cout << "AVX took " << time << " s" << endl;
cout << "AVX Result: " << FinalResult << endl;
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testNormal(lhs, ghs);
}
time = timestamp() - time;
cout << "Normal took " << time << " s" << endl;
cout << "Normal Result: " << FinalResult << endl;
cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
cout << "===========================" << endl;
ghs.clear();
lhs.clear();
}
在哪里
inline double timestamp() {
struct timeval tp;
gettimeofday(&tp, NULL);
return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}
和
typedef vector<int, aligned_allocator<int, sizeof (int)> > aligned_vector;
是使用https://gist.github.com/donny-dont/1471329的 AlignedAllocator 的对齐向量
我有一个 intel-i7 haswell 4771,以及最新的 Ubuntu 14.04 64bit 和 gcc 4.8.2。一切都是最新的。我用 -march=native -mtune=native -O3 -m64 编译。
结果是:
SSE took 0.000375986 s
SSE Result: 1982689
AVX took 0.000459909 s
AVX Result: 1982689
Normal took 0.00315714 s
Normal Result: 1982689
SpeedUP SSE= 8.39696 s
SpeedUP AVX= 6.8647 s
这表明在 AVX2 上完全相同的代码比 SSE 慢 22%。我做错了什么还是这是正常行为?