我一直在尝试使用 std::variant 和 std::visit。
我的想法是,不再使用下面这种写法:
// Virtual-dispatch baseline: an abstract interface plus one concrete type,
// held through an owning pointer to the base.
struct A {
    // Returns a number chosen by the concrete type; never throws.
    virtual unsigned f() const noexcept = 0;
    // Virtual so that derived objects can be destroyed through an A pointer.
    virtual ~A() noexcept = default;
};
struct B0 : A {
    // `override` lets the compiler verify that this really overrides A::f.
    unsigned f() const noexcept override { return 0; }
};
// more types like B0 ...
std::unique_ptr<A> a = std::make_unique<B0>();
而是改用:
// Variant-based counterpart of B0: a plain value type without virtual functions.
struct C0 {
    // Always returns 0 and never throws.
    unsigned f() const noexcept { return 0; }
};
// Illustrative only: C1, C2, ... are placeholders (presumably further types
// written like C0); the list is abbreviated and not compilable as written.
std::variant<C0,C1,C2,/*...*/> c = C0();
我想测量构建一系列此类对象的速度有多快以及调度的速度有多快。请注意,第一个示例(A/Bs...)在动态调度之上需要动态内存,而第二个示例(Cs...)具有自动存储。
为此,我将 B0 和 C0 推广到类型模板中:
// Generalises B0: every value of X yields a distinct type derived from A.
template <unsigned X>
struct B : A {
    // Reports the template argument through the virtual interface.
    unsigned f() const noexcept override { return X; }
};
// Generalises C0: every value of X yields a distinct, non-polymorphic type.
template <unsigned X>
struct C {
    // Reports the template argument; never throws.
    auto f() const noexcept -> unsigned { return X; }
};
然后编写了一个(可能稍微过度设计的)测试工具,用来填充一个 std::vector 并将其读回。完整的代码附在下面。我使用 -std=c++17,分别以 -O1 和 -O3 编译运行。
实际上,它以伪随机的方式分别用 B<...> 和 C<...> 填充预先创建的向量 bs 和 cs,然后调用 bs[i]->f() 或 std::visit([](auto const& c) { return c.f(); },cs[i])(更多细节请参见附加的基准代码)。
我原本预期的是:只含单一类型的 variant<C<0>> 会比对应的动态版本 unique_ptr<A> 快上几个数量级(确实如此);但随着变体增大到 variant<C<0>,...,C<127>>,visit 的效率会显著下降,直到动态调度反而更快(结果并非如此)。
使用 -O3(-O1 的结果非常相似)时,我看到以下结果。这些结果在各次运行之间略有不同,但似乎相对稳定(时间大多保持在 10% 的偏差以内)。
[0,0] time ctor virtual: 35.0315 ns
[0,0] time ctor variant: 2.9425 ns
[0,0] time call virtual: 14.0037 ns (L1)
[0,0] time call variant: 1.44748 ns (L2)
[0,1] time ctor virtual: 34.8007 ns
[0,1] time ctor variant: 2.95368 ns
[0,1] time call virtual: 19.6874 ns
[0,1] time call variant: 7.04521 ns
[0,7] time ctor virtual: 39.6325 ns
[0,7] time ctor variant: 2.97607 ns
[0,7] time call virtual: 30.7592 ns
[0,7] time call variant: 9.22505 ns (L4.1)
[0,31] time ctor virtual: 35.0002 ns
[0,31] time ctor variant: 2.95473 ns
[0,31] time call virtual: 24.3198 ns
[0,31] time call variant: 9.72678 ns (L4.2)
[0,127] time ctor virtual: 36.5918 ns
[0,127] time ctor variant: 2.95542 ns
[0,127] time call virtual: 26.701 ns (L3)
[0,127] time call variant: 9.88592 ns (L4.3)
讨论
我认为,(L1) 的耗时较短可以用缓存和/或分支预测来解释。(L2) 完全符合预期:如果变体只有一个类型,调度会非常快。构造的各项耗时也都说得通:ctor variant 在任何时候都不会调用 malloc,因此很清楚它为什么比 ctor virtual 快得多,并且无论动态类型的数量是多少,其耗时都大致恒定。
随着动态类型数量的增加,call virtual 大致保持不变 (L3),这在预料之中。但是,为什么 call variant 在 (L4.1) 和 (L4.3) 之间没有(更明显地)上升?
注意:受我的测试工具中模板编程方式的限制,我无法进一步增大范围,否则 g++ 会在编译期间崩溃或耗尽我的内存。
无论如何,测试函数 f 已经尽可能简单,这意味着测量结果尽可能准确地反映了调度本身产生的开销。
验证
我的问题是:
- 我如何验证这些结果,使其具有代表性?
- 如何确保编译器没有把相关部分优化掉?
- 其他基准测试是否得出相同的结论,即 std::variant 的调度总是快大约 2-3 倍?
完整的基准
// g++ -Wall -Wextra -pedantic -std=c++17 -O3 a.cpp
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <random>
#include <variant>
#include <vector>
// Period used when measured durations are converted to a double count;
// std::nano means the count is in nanoseconds.
using chronores = std::nano;
// Unit label printed after every reported time; keep it in sync with chronores.
static constexpr char const resstr[] = "ns";
namespace helper {
// Builds std::variant<T<X>, T<X+1>, ..., T<UB>> by recursing over X.
// Done collects the alternatives gathered so far; callers leave it empty.
template <template <unsigned> typename T, unsigned X, unsigned UB, typename... Done>
struct mkvar : mkvar<T, X + 1, UB, Done..., T<X>> {};

// Recursion end (X == UB): append the last alternative and expose the variant.
template <template <unsigned> typename T, unsigned UB, typename... Done>
struct mkvar<T, UB, UB, Done...> {
    using type = std::variant<Done..., T<UB>>;
};

// Shorthand for the variant holding T<LB> through T<UB>, both inclusive.
template <template <unsigned> typename T, unsigned LB, unsigned UB>
using mkvar_t = typename mkvar<T, LB, UB>::type;
// Wraps an unsigned constant in a type so it can be passed as a type argument.
template <unsigned X>
struct Num {
    // The successor constant, Num<X+1>.
    using inc = Num<X + 1>;
    // The wrapped constant itself.
    static constexpr unsigned value{X};
};
// Maps a runtime value onto a compile-time constant: starting at NumX::value
// and counting upwards, the first candidate X equal to the runtime x appends
// a T<X> to the container. With use_variant the element is a T<X> value,
// otherwise a std::unique_ptr owning a newly allocated T<X>.
template <typename NumX, typename NumUB, template <unsigned> typename T, bool use_variant>
struct ctor_Num {
    static constexpr auto X = NumX::value;
    static constexpr auto UB = NumUB::value;

    template <typename Container>
    static void run(unsigned x, Container& container) {
        if (x != X) {
            // Not this candidate: hand over to the next constant.
            ctor_Num<typename NumX::inc, NumUB, T, use_variant>::run(x, container);
            return;
        }
        if constexpr (use_variant) {
            container.emplace_back(T<X>());
        } else {
            container.emplace_back(std::make_unique<T<X>>());
        }
    }
};

// Recursion end: the candidate is one past NumUB, so x matched nothing in
// range and the container is left untouched.
template <typename NumUB, template <unsigned> typename T, bool use_variant>
struct ctor_Num<typename NumUB::inc, NumUB, T, use_variant> {
    template <typename Container>
    static void run(unsigned, Container&) {}
};
template <unsigned X, unsigned UB, template <unsigned> typename T, bool use_variant, typename Container>
inline void ctor(unsigned x, Container& container) {
return ctor_Num<Num<X>,Num<UB>,T,use_variant>::run(x,container);
}
// Scope timer: on destruction, adds the time elapsed since construction to a
// caller-owned accumulator, counted in units of chronores.
struct Time {
    double& time;  // accumulator that receives the elapsed time; must outlive this object
    std::chrono::time_point<std::chrono::steady_clock> start;

    // explicit: a double must not silently convert into a running timer.
    // The clock is read in the initialiser list, right after `time` is bound.
    explicit Time(double& time)
        : time(time), start(std::chrono::steady_clock::now()) {}

    // Non-copyable: every copy's destructor would add its own elapsed time to
    // the same accumulator, counting the interval more than once.
    Time(Time const&) = delete;
    Time& operator=(Time const&) = delete;

    ~Time() {
        auto const finish = std::chrono::steady_clock::now();
        time += std::chrono::duration<double, chronores>(finish - start).count();
    }
};
}
template <unsigned LB, unsigned UB>
struct measure {
struct A {
virtual unsigned f() const noexcept = 0;
virtual ~A() noexcept {}
};
template <unsigned X>
struct B : A {
virtual unsigned f() const noexcept override { return X; }
};
template <unsigned X>
struct C {
unsigned f() const noexcept { return X; }
};
static void main(std::size_t const N, std::size_t const R = 10, bool warmup = false) {
if (!warmup) main(N,1,true);
using namespace helper;
std::vector<std::unique_ptr<A>> bs;
bs.reserve(N);
std::vector<mkvar_t<C,LB,UB>> cs;
cs.reserve(N);
std::uniform_int_distribution<unsigned> distr(LB,UB);
double time_ctor_virtual = 0;
double time_ctor_variant = 0;
double time_call_virtual = 0;
double time_call_variant = 0;
unsigned volatile sum = 0;
std::mt19937 mt(42); mt.discard(100);
for (std::size_t r = 0; r < R; ++r) {
bs.clear();
cs.clear();
{
Time scope(time_ctor_virtual);
for (std::size_t i = 0; i < N; ++i) {
bs.emplace_back(std::make_unique<B<UB>>());
}
}
{
Time scope(time_ctor_variant);
for (std::size_t i = 0; i < N; ++i) {
cs.emplace_back(C<UB>());
}
}
bs.clear();
cs.clear();
for (std::size_t i = 0; i < N; ++i) {
auto const rn = distr(mt);
// effectively calls bs.emplace_back(std::make_unique<B<rn>>())
ctor<LB,UB,B,false>(rn,bs);
// effectively calls cs.emplace_back(C<rn>())
ctor<LB,UB,C,true >(rn,cs);
}
{
Time scope(time_call_variant);
for (std::size_t i = 0; i < N; ++i) {
sum += std::visit([](auto const& c) { return c.f(); },cs[i]);
}
}
{
Time scope(time_call_virtual);
for (std::size_t i = 0; i < N; ++i) {
sum += bs[i]->f();
}
}
}
(void)sum;
if (!warmup) {
std::cout << "[" << LB << "," << UB << "] time ctor virtual: " << (time_ctor_virtual/N/R) << " " << resstr << "\n";
std::cout << "[" << LB << "," << UB << "] time ctor variant: " << (time_ctor_variant/N/R) << " " << resstr << "\n";
std::cout << "[" << LB << "," << UB << "] time call virtual: " << (time_call_virtual/N/R) << " " << resstr << "\n";
std::cout << "[" << LB << "," << UB << "] time call variant: " << (time_call_variant/N/R) << " " << resstr << "\n";
}
}
};
// Runs the benchmark for the type ranges [0,0], [0,1], [0,7], [0,31] and
// [0,127], printing a blank line after each group of results.
int main() {
    constexpr std::size_t element_count = 400000;

    measure<0, 0>::main(element_count);
    std::cout << "\n";

    measure<0, 1>::main(element_count);
    std::cout << "\n";

    measure<0, 7>::main(element_count);
    std::cout << "\n";

    measure<0, 31>::main(element_count);
    std::cout << "\n";

    measure<0, 127>::main(element_count);
    std::cout << "\n";

    return 0;
}