我想通过比较带有可预测分支的循环与带有随机分支的循环的运行时间,来更好地理解分支预测。
所以我写了一个程序,它使用由 0 和 1 按不同顺序排列而成的大数组(即全 0、0-1 交替、全随机),遍历数组并根据当前元素是 0 还是 1 进行分支,在每个分支中执行一些浪费时间的工作。
我预计难以预测的数组会花费更长的运行时间,因为分支预测器会更频繁地猜错;并且无论浪费时间的工作量有多大,两类数组之间的运行时间差都应保持不变。
然而,随着浪费时间的工作量增加,不同数组之间的运行时间差异也大幅增加。
(X 轴是浪费时间的工作量,Y 轴是运行时间)
有人能解释这种行为吗?我运行的代码如下:
#include <stdlib.h>
#include <time.h>
#include <chrono>
#include <stdio.h>
#include <iostream>
#include <vector>
using namespace std;
// Benchmark configuration shared by the timing routine and the driver.
static constexpr int s_iArrayLen = 999999;    // number of 0/1 entries walked in every timed run
static constexpr int s_iMaxPipelineLen = 60;  // pipeline lengths 0 .. s_iMaxPipelineLen-1 are measured
static constexpr int s_iNumTrials = 10;       // timed runs averaged per (method, pipeline length) pair
/**
 * Runs the branch benchmark once and reports how long the timed walk took.
 *
 * Walks the first s_iArrayLen entries of `vals`. For every entry it branches
 * on whether the value is 0 and then increments each slot of one of two
 * scratch counter arrays (one for the zero branch, one for the non-zero
 * branch). Only that walk is timed; setup and the final check are not.
 *
 * @param vals        array of at least s_iArrayLen ints to branch on
 * @param pipelineLen number of counter increments done per array entry;
 *                    a value <= 0 means no per-entry work is done
 * @return elapsed time of the walk in microseconds, or -1 if the post-run
 *         consistency check on the counters fails
 */
int doWorkAndReturnMicrosecondsElapsed(int* vals, int pipelineLen){
    // Vectors own the scratch counters so every return path releases them
    // (the original leaked both buffers on the -1 path).
    const std::vector<int>::size_type slotCount =
        pipelineLen > 0 ? static_cast<std::vector<int>::size_type>(pipelineLen) : 0;
    std::vector<int> zeroStorage(slotCount);   // value-initialized to 0
    std::vector<int> oneStorage(slotCount);

    // The hot loop works through raw pointers so the measured code is the
    // same plain array increment as before.
    int* const zeroNums = zeroStorage.data();
    int* const oneNums = oneStorage.data();

    // steady_clock is the monotonic clock meant for measuring intervals;
    // system_clock may be adjusted while the benchmark is running.
    const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    for(int i = 0; i < s_iArrayLen; ++i){
        if(vals[i] == 0){
            for(int j = 0; j < pipelineLen; ++j)
                ++zeroNums[j];
        }
        else{
            for(int j = 0; j < pipelineLen; ++j)
                ++oneNums[j];
        }
    }
    const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    const int elapsedMicroseconds = static_cast<int>(
        std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());

    // This should never fire; it only exists so the compiler cannot discard
    // the counter updates as dead code.
    for(int i = 0; i < pipelineLen - 1; ++i)
        if(zeroNums[i] != zeroNums[i+1] || oneNums[i] != oneNums[i+1])
            return -1;

    return elapsedMicroseconds;
}
struct TestMethod{
string name;
void (*func)(int, int&);
int* results;
TestMethod(string _name, void (*_func)(int, int&)) { name = _name; func = _func; results = new int[s_iMaxPipelineLen]; }
};
int main(){
srand( (unsigned int)time(nullptr) );
vector<TestMethod> testMethods;
testMethods.push_back(TestMethod("all-zero", [](int index, int& out) { out = 0; } ));
testMethods.push_back(TestMethod("repeat-0-1", [](int index, int& out) { out = index % 2; } ));
testMethods.push_back(TestMethod("repeat-0-0-0-1", [](int index, int& out) { out = (index % 4 == 0) ? 0 : 1; } ));
testMethods.push_back(TestMethod("rand", [](int index, int& out) { out = rand() % 2; } ));
int* vals = new int[s_iArrayLen];
for(int currentPipelineLen = 0; currentPipelineLen < s_iMaxPipelineLen; ++currentPipelineLen){
for(int currentMethod = 0; currentMethod < (int)testMethods.size(); ++currentMethod){
int resultsSum = 0;
for(int trialNum = 0; trialNum < s_iNumTrials; ++trialNum){
//Generate a new array...
for(int i = 0; i < s_iArrayLen; ++i)
testMethods[currentMethod].func(i, vals[i]);
//And record how long it takes
resultsSum += doWorkAndReturnMicrosecondsElapsed(vals, currentPipelineLen);
}
testMethods[currentMethod].results[currentPipelineLen] = (resultsSum / s_iNumTrials);
}
}
cout << "\t";
for(int i = 0; i < s_iMaxPipelineLen; ++i){
cout << i << "\t";
}
cout << "\n";
for (int i = 0; i < (int)testMethods.size(); ++i){
cout << testMethods[i].name.c_str() << "\t";
for(int j = 0; j < s_iMaxPipelineLen; ++j){
cout << testMethods[i].results[j] << "\t";
}
cout << "\n";
}
int end;
cin >> end;
delete[] vals;
}
Pastebin 链接:http://pastebin.com/F0JAu3uw