我以前做过这个确切的任务。但主要是测量功耗和CPU温度。下面的代码(相当长)在我的 Core i7 2600K 上实现了接近最优。
这里要注意的关键是大量的手动循环展开以及乘法和加法的交错......
完整的项目可以在我的 GitHub 上找到:https ://github.com/Mysticial/Flops
警告:
如果您决定编译并运行它,请注意您的 CPU 温度!!!
确保不要过热。并确保 CPU 节流不会影响您的结果!
此外,我对运行此代码可能造成的任何损害不承担任何责任。
笔记:
- 此代码针对 x64 进行了优化。x86 没有足够的寄存器来编译。
- 此代码已经过测试,可在 Visual Studio 2010/2012 和 GCC 4.6 上正常运行。
ICC 11 (Intel Compiler 11) 出人意料地难以很好地编译它。
- 这些适用于 FMA 之前的处理器。为了在 Intel Haswell 和 AMD Bulldozer 处理器(及更高版本)上实现峰值 FLOPS,将需要 FMA(Fused Multiply Add)指令。这些超出了本基准的范围。
#include <emmintrin.h>
#include <omp.h>
#include <iostream>
using namespace std;
typedef unsigned long long uint64;
double test_dp_mac_SSE(double x,double y,uint64 iterations){
register __m128d r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
// Generate starting data.
r0 = _mm_set1_pd(x);
r1 = _mm_set1_pd(y);
r8 = _mm_set1_pd(-0.0);
r2 = _mm_xor_pd(r0,r8);
r3 = _mm_or_pd(r0,r8);
r4 = _mm_andnot_pd(r8,r0);
r5 = _mm_mul_pd(r1,_mm_set1_pd(0.37796447300922722721));
r6 = _mm_mul_pd(r1,_mm_set1_pd(0.24253562503633297352));
r7 = _mm_mul_pd(r1,_mm_set1_pd(4.1231056256176605498));
r8 = _mm_add_pd(r0,_mm_set1_pd(0.37796447300922722721));
r9 = _mm_add_pd(r1,_mm_set1_pd(0.24253562503633297352));
rA = _mm_sub_pd(r0,_mm_set1_pd(4.1231056256176605498));
rB = _mm_sub_pd(r1,_mm_set1_pd(4.1231056256176605498));
rC = _mm_set1_pd(1.4142135623730950488);
rD = _mm_set1_pd(1.7320508075688772935);
rE = _mm_set1_pd(0.57735026918962576451);
rF = _mm_set1_pd(0.70710678118654752440);
uint64 iMASK = 0x800fffffffffffffull;
__m128d MASK = _mm_set1_pd(*(double*)&iMASK);
__m128d vONE = _mm_set1_pd(1.0);
uint64 c = 0;
while (c < iterations){
size_t i = 0;
while (i < 1000){
// Here's the meat - the part that really matters.
r0 = _mm_mul_pd(r0,rC);
r1 = _mm_add_pd(r1,rD);
r2 = _mm_mul_pd(r2,rE);
r3 = _mm_sub_pd(r3,rF);
r4 = _mm_mul_pd(r4,rC);
r5 = _mm_add_pd(r5,rD);
r6 = _mm_mul_pd(r6,rE);
r7 = _mm_sub_pd(r7,rF);
r8 = _mm_mul_pd(r8,rC);
r9 = _mm_add_pd(r9,rD);
rA = _mm_mul_pd(rA,rE);
rB = _mm_sub_pd(rB,rF);
r0 = _mm_add_pd(r0,rF);
r1 = _mm_mul_pd(r1,rE);
r2 = _mm_sub_pd(r2,rD);
r3 = _mm_mul_pd(r3,rC);
r4 = _mm_add_pd(r4,rF);
r5 = _mm_mul_pd(r5,rE);
r6 = _mm_sub_pd(r6,rD);
r7 = _mm_mul_pd(r7,rC);
r8 = _mm_add_pd(r8,rF);
r9 = _mm_mul_pd(r9,rE);
rA = _mm_sub_pd(rA,rD);
rB = _mm_mul_pd(rB,rC);
r0 = _mm_mul_pd(r0,rC);
r1 = _mm_add_pd(r1,rD);
r2 = _mm_mul_pd(r2,rE);
r3 = _mm_sub_pd(r3,rF);
r4 = _mm_mul_pd(r4,rC);
r5 = _mm_add_pd(r5,rD);
r6 = _mm_mul_pd(r6,rE);
r7 = _mm_sub_pd(r7,rF);
r8 = _mm_mul_pd(r8,rC);
r9 = _mm_add_pd(r9,rD);
rA = _mm_mul_pd(rA,rE);
rB = _mm_sub_pd(rB,rF);
r0 = _mm_add_pd(r0,rF);
r1 = _mm_mul_pd(r1,rE);
r2 = _mm_sub_pd(r2,rD);
r3 = _mm_mul_pd(r3,rC);
r4 = _mm_add_pd(r4,rF);
r5 = _mm_mul_pd(r5,rE);
r6 = _mm_sub_pd(r6,rD);
r7 = _mm_mul_pd(r7,rC);
r8 = _mm_add_pd(r8,rF);
r9 = _mm_mul_pd(r9,rE);
rA = _mm_sub_pd(rA,rD);
rB = _mm_mul_pd(rB,rC);
i++;
}
// Need to renormalize to prevent denormal/overflow.
r0 = _mm_and_pd(r0,MASK);
r1 = _mm_and_pd(r1,MASK);
r2 = _mm_and_pd(r2,MASK);
r3 = _mm_and_pd(r3,MASK);
r4 = _mm_and_pd(r4,MASK);
r5 = _mm_and_pd(r5,MASK);
r6 = _mm_and_pd(r6,MASK);
r7 = _mm_and_pd(r7,MASK);
r8 = _mm_and_pd(r8,MASK);
r9 = _mm_and_pd(r9,MASK);
rA = _mm_and_pd(rA,MASK);
rB = _mm_and_pd(rB,MASK);
r0 = _mm_or_pd(r0,vONE);
r1 = _mm_or_pd(r1,vONE);
r2 = _mm_or_pd(r2,vONE);
r3 = _mm_or_pd(r3,vONE);
r4 = _mm_or_pd(r4,vONE);
r5 = _mm_or_pd(r5,vONE);
r6 = _mm_or_pd(r6,vONE);
r7 = _mm_or_pd(r7,vONE);
r8 = _mm_or_pd(r8,vONE);
r9 = _mm_or_pd(r9,vONE);
rA = _mm_or_pd(rA,vONE);
rB = _mm_or_pd(rB,vONE);
c++;
}
r0 = _mm_add_pd(r0,r1);
r2 = _mm_add_pd(r2,r3);
r4 = _mm_add_pd(r4,r5);
r6 = _mm_add_pd(r6,r7);
r8 = _mm_add_pd(r8,r9);
rA = _mm_add_pd(rA,rB);
r0 = _mm_add_pd(r0,r2);
r4 = _mm_add_pd(r4,r6);
r8 = _mm_add_pd(r8,rA);
r0 = _mm_add_pd(r0,r4);
r0 = _mm_add_pd(r0,r8);
// Prevent Dead Code Elimination
double out = 0;
__m128d temp = r0;
out += ((double*)&temp)[0];
out += ((double*)&temp)[1];
return out;
}
void test_dp_mac_SSE(int tds,uint64 iterations){
double *sum = (double*)malloc(tds * sizeof(double));
double start = omp_get_wtime();
#pragma omp parallel num_threads(tds)
{
double ret = test_dp_mac_SSE(1.1,2.1,iterations);
sum[omp_get_thread_num()] = ret;
}
double secs = omp_get_wtime() - start;
uint64 ops = 48 * 1000 * iterations * tds * 2;
cout << "Seconds = " << secs << endl;
cout << "FP Ops = " << ops << endl;
cout << "FLOPs = " << ops / secs << endl;
double out = 0;
int c = 0;
while (c < tds){
out += sum[c++];
}
cout << "sum = " << out << endl;
cout << endl;
free(sum);
}
int main(){
// (threads, iterations)
test_dp_mac_SSE(8,10000000);
system("pause");
}
输出(1 个线程,10000000 次迭代) - 使用 Visual Studio 2010 SP1 编译 - x64 版本:
Seconds = 55.5104
FP Ops = 960000000000
FLOPs = 1.7294e+010
sum = 2.22652
该机器是 Core i7 2600K @ 4.4 GHz。理论 SSE 峰值为 4 flops * 4.4 GHz = 17.6 GFlops。这段代码达到了17.3 GFlops - 不错。
输出(8 个线程,10000000 次迭代) - 使用 Visual Studio 2010 SP1 编译 - x64 版本:
Seconds = 117.202
FP Ops = 7680000000000
FLOPs = 6.55279e+010
sum = 17.8122
理论 SSE 峰值为 4 触发器 * 4 个内核 * 4.4 GHz = 70.4 GFlops。实际是65.5 GFlops。
让我们更进一步。AVX...
#include <immintrin.h>
#include <omp.h>
#include <iostream>
using namespace std;
typedef unsigned long long uint64;
double test_dp_mac_AVX(double x,double y,uint64 iterations){
register __m256d r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF;
// Generate starting data.
r0 = _mm256_set1_pd(x);
r1 = _mm256_set1_pd(y);
r8 = _mm256_set1_pd(-0.0);
r2 = _mm256_xor_pd(r0,r8);
r3 = _mm256_or_pd(r0,r8);
r4 = _mm256_andnot_pd(r8,r0);
r5 = _mm256_mul_pd(r1,_mm256_set1_pd(0.37796447300922722721));
r6 = _mm256_mul_pd(r1,_mm256_set1_pd(0.24253562503633297352));
r7 = _mm256_mul_pd(r1,_mm256_set1_pd(4.1231056256176605498));
r8 = _mm256_add_pd(r0,_mm256_set1_pd(0.37796447300922722721));
r9 = _mm256_add_pd(r1,_mm256_set1_pd(0.24253562503633297352));
rA = _mm256_sub_pd(r0,_mm256_set1_pd(4.1231056256176605498));
rB = _mm256_sub_pd(r1,_mm256_set1_pd(4.1231056256176605498));
rC = _mm256_set1_pd(1.4142135623730950488);
rD = _mm256_set1_pd(1.7320508075688772935);
rE = _mm256_set1_pd(0.57735026918962576451);
rF = _mm256_set1_pd(0.70710678118654752440);
uint64 iMASK = 0x800fffffffffffffull;
__m256d MASK = _mm256_set1_pd(*(double*)&iMASK);
__m256d vONE = _mm256_set1_pd(1.0);
uint64 c = 0;
while (c < iterations){
size_t i = 0;
while (i < 1000){
// Here's the meat - the part that really matters.
r0 = _mm256_mul_pd(r0,rC);
r1 = _mm256_add_pd(r1,rD);
r2 = _mm256_mul_pd(r2,rE);
r3 = _mm256_sub_pd(r3,rF);
r4 = _mm256_mul_pd(r4,rC);
r5 = _mm256_add_pd(r5,rD);
r6 = _mm256_mul_pd(r6,rE);
r7 = _mm256_sub_pd(r7,rF);
r8 = _mm256_mul_pd(r8,rC);
r9 = _mm256_add_pd(r9,rD);
rA = _mm256_mul_pd(rA,rE);
rB = _mm256_sub_pd(rB,rF);
r0 = _mm256_add_pd(r0,rF);
r1 = _mm256_mul_pd(r1,rE);
r2 = _mm256_sub_pd(r2,rD);
r3 = _mm256_mul_pd(r3,rC);
r4 = _mm256_add_pd(r4,rF);
r5 = _mm256_mul_pd(r5,rE);
r6 = _mm256_sub_pd(r6,rD);
r7 = _mm256_mul_pd(r7,rC);
r8 = _mm256_add_pd(r8,rF);
r9 = _mm256_mul_pd(r9,rE);
rA = _mm256_sub_pd(rA,rD);
rB = _mm256_mul_pd(rB,rC);
r0 = _mm256_mul_pd(r0,rC);
r1 = _mm256_add_pd(r1,rD);
r2 = _mm256_mul_pd(r2,rE);
r3 = _mm256_sub_pd(r3,rF);
r4 = _mm256_mul_pd(r4,rC);
r5 = _mm256_add_pd(r5,rD);
r6 = _mm256_mul_pd(r6,rE);
r7 = _mm256_sub_pd(r7,rF);
r8 = _mm256_mul_pd(r8,rC);
r9 = _mm256_add_pd(r9,rD);
rA = _mm256_mul_pd(rA,rE);
rB = _mm256_sub_pd(rB,rF);
r0 = _mm256_add_pd(r0,rF);
r1 = _mm256_mul_pd(r1,rE);
r2 = _mm256_sub_pd(r2,rD);
r3 = _mm256_mul_pd(r3,rC);
r4 = _mm256_add_pd(r4,rF);
r5 = _mm256_mul_pd(r5,rE);
r6 = _mm256_sub_pd(r6,rD);
r7 = _mm256_mul_pd(r7,rC);
r8 = _mm256_add_pd(r8,rF);
r9 = _mm256_mul_pd(r9,rE);
rA = _mm256_sub_pd(rA,rD);
rB = _mm256_mul_pd(rB,rC);
i++;
}
// Need to renormalize to prevent denormal/overflow.
r0 = _mm256_and_pd(r0,MASK);
r1 = _mm256_and_pd(r1,MASK);
r2 = _mm256_and_pd(r2,MASK);
r3 = _mm256_and_pd(r3,MASK);
r4 = _mm256_and_pd(r4,MASK);
r5 = _mm256_and_pd(r5,MASK);
r6 = _mm256_and_pd(r6,MASK);
r7 = _mm256_and_pd(r7,MASK);
r8 = _mm256_and_pd(r8,MASK);
r9 = _mm256_and_pd(r9,MASK);
rA = _mm256_and_pd(rA,MASK);
rB = _mm256_and_pd(rB,MASK);
r0 = _mm256_or_pd(r0,vONE);
r1 = _mm256_or_pd(r1,vONE);
r2 = _mm256_or_pd(r2,vONE);
r3 = _mm256_or_pd(r3,vONE);
r4 = _mm256_or_pd(r4,vONE);
r5 = _mm256_or_pd(r5,vONE);
r6 = _mm256_or_pd(r6,vONE);
r7 = _mm256_or_pd(r7,vONE);
r8 = _mm256_or_pd(r8,vONE);
r9 = _mm256_or_pd(r9,vONE);
rA = _mm256_or_pd(rA,vONE);
rB = _mm256_or_pd(rB,vONE);
c++;
}
r0 = _mm256_add_pd(r0,r1);
r2 = _mm256_add_pd(r2,r3);
r4 = _mm256_add_pd(r4,r5);
r6 = _mm256_add_pd(r6,r7);
r8 = _mm256_add_pd(r8,r9);
rA = _mm256_add_pd(rA,rB);
r0 = _mm256_add_pd(r0,r2);
r4 = _mm256_add_pd(r4,r6);
r8 = _mm256_add_pd(r8,rA);
r0 = _mm256_add_pd(r0,r4);
r0 = _mm256_add_pd(r0,r8);
// Prevent Dead Code Elimination
double out = 0;
__m256d temp = r0;
out += ((double*)&temp)[0];
out += ((double*)&temp)[1];
out += ((double*)&temp)[2];
out += ((double*)&temp)[3];
return out;
}
void test_dp_mac_AVX(int tds,uint64 iterations){
double *sum = (double*)malloc(tds * sizeof(double));
double start = omp_get_wtime();
#pragma omp parallel num_threads(tds)
{
double ret = test_dp_mac_AVX(1.1,2.1,iterations);
sum[omp_get_thread_num()] = ret;
}
double secs = omp_get_wtime() - start;
uint64 ops = 48 * 1000 * iterations * tds * 4;
cout << "Seconds = " << secs << endl;
cout << "FP Ops = " << ops << endl;
cout << "FLOPs = " << ops / secs << endl;
double out = 0;
int c = 0;
while (c < tds){
out += sum[c++];
}
cout << "sum = " << out << endl;
cout << endl;
free(sum);
}
int main(){
// (threads, iterations)
test_dp_mac_AVX(8,10000000);
system("pause");
}
输出(1 个线程,10000000 次迭代) - 使用 Visual Studio 2010 SP1 编译 - x64 版本:
Seconds = 57.4679
FP Ops = 1920000000000
FLOPs = 3.34099e+010
sum = 4.45305
理论 AVX 峰值为 8 flops * 4.4 GHz = 35.2 GFlops。实际是33.4 GFlops。
输出(8 个线程,10000000 次迭代) - 使用 Visual Studio 2010 SP1 编译 - x64 版本:
Seconds = 111.119
FP Ops = 15360000000000
FLOPs = 1.3823e+011
sum = 35.6244
理论 AVX 峰值为 8 触发器 * 4 个内核 * 4.4 GHz = 140.8 GFlops。实际是138.2 GFlops。
现在进行一些解释:
性能关键部分显然是内部循环内的 48 条指令。您会注意到它分为 4 块,每块 12 条指令。这 12 个指令块中的每一个都完全相互独立 - 平均需要 6 个周期来执行。
所以从发布到使用之间有 12 条指令和 6 个周期。乘法的延迟为 5 个周期,因此足以避免延迟停止。
需要规范化步骤来防止数据上溢/下溢。这是必要的,因为无操作代码将缓慢增加/减少数据的大小。
因此,如果您只使用全零并摆脱规范化步骤,实际上可以做得比这更好。然而,由于我编写了测量功耗和温度的基准,我必须确保触发器是在“真实”数据上,而不是零——因为执行单元很可能对使用较少功率的零进行特殊处理并产生更少的热量。
更多结果:
- 英特尔酷睿 i7 920 @ 3.5 GHz
- Windows 7 旗舰版 x64
- Visual Studio 2010 SP1 - x64 版本
主题:1
Seconds = 72.1116
FP Ops = 960000000000
FLOPs = 1.33127e+010
sum = 2.22652
理论 SSE 峰值:4 触发器 * 3.5 GHz = 14.0 GFlops。实际是13.3 GFlops。
主题:8
Seconds = 149.576
FP Ops = 7680000000000
FLOPs = 5.13452e+010
sum = 17.8122
理论 SSE 峰值:4 触发器 * 4 核 * 3.5 GHz = 56.0 GFlops。实际是51.3 GFlops。
在多线程运行中,我的处理器温度达到 76C!如果您运行这些,请确保结果不受 CPU 限制的影响。
- 2 x Intel Xeon X5482 Harpertown @ 3.2 GHz
- Ubuntu Linux 10 x64
- GCC 4.5.2 x64 - (-O2 -msse3 -fopenmp)
主题:1
Seconds = 78.3357
FP Ops = 960000000000
FLOPs = 1.22549e+10
sum = 2.22652
理论 SSE 峰值:4 触发器 * 3.2 GHz = 12.8 GFlops。实际是12.3 GFlops。
主题:8
Seconds = 78.4733
FP Ops = 7680000000000
FLOPs = 9.78676e+10
sum = 17.8122
理论 SSE 峰值:4 触发器 * 8 个内核 * 3.2 GHz = 102.4 GFlops。实际是97.9 GFlops。