最近我试图做一些性能基准测试,比较std::stack<int, std::vector<int>>
我自己简单的堆栈实现(使用预分配内存)。现在我遇到了一些奇怪的行为。
我想问的第一件事是堆栈基准代码中的这一行:
// std::vector<int> magicVector(10);
当我取消注释这条线时,性能提高了大约 17%(基准时间从 6.5 秒下降到 5.4 秒)。但是该行应该对程序的其余部分没有影响,因为它不会修改任何其他成员。此外,它是 int 的向量还是 double 的向量都没有关系......
我想问的第二件事是我的堆栈实现和std::stack
. 有人告诉我这std::stack
应该和我的堆栈一样快,但结果显示我的“FastStack”快两倍。
结果(未注释的性能增加线):
stack 5.38979
stack 5.34406
stack 5.32404
stack 5.30519
FastStack 2.59635
FastStack 2.59204
FastStack 2.59713
FastStack 2.64814
这些结果来自带有 /O2、/Ot、/Ob2 和其他默认优化的 VS2010 的发布版本。我的 CPU 是 Intel i5 3570k,默认时钟(一个线程 3.6 GHz)。
我将所有代码放在一个文件中,以便任何人都可以轻松测试它。
#define _SECURE_SCL 0
#include <iostream>
#include <vector>
#include <stack>
#include <Windows.h>
using namespace std;
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
// Purpose: High Resolution Timer
//---------------------------------------------------------------------------------
class HRTimer
{
public:
HRTimer();
double GetFrequency(void);
void Start(void) ;
double Stop(void);
double GetTime();
private:
LARGE_INTEGER start;
LARGE_INTEGER stop;
double frequency;
};
HRTimer::HRTimer()
{
frequency = this->GetFrequency();
}
double HRTimer::GetFrequency(void)
{
LARGE_INTEGER proc_freq;
if (!::QueryPerformanceFrequency(&proc_freq))
return -1;
return proc_freq.QuadPart;
}
void HRTimer::Start(void)
{
DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
::QueryPerformanceCounter(&start);
::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
}
double HRTimer::Stop(void)
{
DWORD_PTR oldmask = ::SetThreadAffinityMask(::GetCurrentThread(), 0);
::QueryPerformanceCounter(&stop);
::SetThreadAffinityMask(::GetCurrentThread(), oldmask);
return ((stop.QuadPart - start.QuadPart) / frequency);
}
double HRTimer::GetTime()
{
LARGE_INTEGER time;
::QueryPerformanceCounter(&time);
return time.QuadPart / frequency;
}
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
// Purpose: Should be faster than std::stack
//---------------------------------------------------------------------------------
template <class T>
class FastStack
{
public:
T* st;
int allocationSize;
int lastIndex;
public:
FastStack(int stackSize);
~FastStack();
inline void resize(int newSize);
inline void push(T x);
inline void pop();
inline T getAndRemove();
inline T getLast();
inline void clear();
};
template <class T>
FastStack<T>::FastStack( int stackSize )
{
st = NULL;
this->allocationSize = stackSize;
st = new T[stackSize];
lastIndex = -1;
}
template <class T>
FastStack<T>::~FastStack()
{
delete [] st;
}
template <class T>
void FastStack<T>::clear()
{
lastIndex = -1;
}
template <class T>
T FastStack<T>::getLast()
{
return st[lastIndex];
}
template <class T>
T FastStack<T>::getAndRemove()
{
return st[lastIndex--];
}
template <class T>
void FastStack<T>::pop()
{
--lastIndex;
}
template <class T>
void FastStack<T>::push( T x )
{
st[++lastIndex] = x;
}
template <class T>
void FastStack<T>::resize( int newSize )
{
if (st != NULL)
delete [] st;
st = new T[newSize];
}
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
// Purpose: Benchmark of std::stack and FastStack
//---------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
#if 1
for (int it = 0; it < 4; it++)
{
std::stack<int, std::vector<int>> bStack;
int x;
for (int i = 0; i < 100; i++) // after this two loops, bStack's capacity will be 141 so there will be no more reallocating
bStack.push(i);
for (int i = 0; i < 100; i++)
bStack.pop();
// std::vector<int> magicVector(10); // when you uncomment this line, performance will magically rise about 18%
HRTimer timer;
timer.Start();
for (int i = 0; i < 2000000000; i++)
{
bStack.push(i);
x = bStack.top();
if (i % 100 == 0 && i != 0)
for (int j = 0; j < 100; j++)
bStack.pop();
}
double totalTime = timer.Stop();
cout << "stack " << totalTime << endl;
}
#endif
//------------------------------------------------------------------------------------
#if 1
for (int it = 0; it < 4; it++)
{
FastStack<int> fstack(200);
int x;
HRTimer timer;
timer.Start();
for (int i = 0; i < 2000000000; i++)
{
fstack.push(i);
x = fstack.getLast();
if (i % 100 == 0 && i != 0)
for (int j = 0; j < 100; j++)
fstack.pop();
}
double totalTime = timer.Stop();
cout << "FastStack " << totalTime << endl;
}
#endif
cout << "Done";
cin.get();
return 0;
}
.
编辑:由于每个人都在谈论我的堆栈实现非常糟糕,我想把事情做好。我在几分钟内创建了该堆栈,并且只实现了我当前需要的几个功能。它从来都不是要替代 std::stack :) 或保存以在所有情况下使用。唯一的目标是达到最大速度和正确的结果。对这个误会很抱歉……我只想知道几个答案……</p>