c++ - 如何验证访问者（变体）静态调度与多态动态调度基准的结果？

Question

我一直在尝试将 std::variant 与 std::visit 结合使用。

我的想法是，不再使用下面这种写法：

// Run-time polymorphism: an abstract interface whose f() is resolved through
// a virtual call.
struct A {
  virtual unsigned f() const noexcept = 0;
  // Virtual so that deleting a derived object through an A* (as unique_ptr<A>
  // does below) runs the derived destructor.
  virtual ~A() noexcept = default;
};

// One concrete implementation. `override` lets the compiler verify that the
// signature really matches A::f() (same convention as the B<X> template).
struct B0 : A {
  unsigned f() const noexcept override { return 0; }
};
// more types like B0 ...
// The object lives on the heap and is owned through the base-class pointer.
std::unique_ptr<A> a = std::make_unique<B0>();

我有

// Value-type counterpart of B0: the same f(), but no base class and no
// virtual functions.
struct C0 {
  unsigned f() const noexcept { return 0; }
};
// A variant over the closed set of alternatives, initialised to hold a C0.
// C1, C2, ... are placeholders in this listing (presumably defined like C0).
std::variant<C0,C1,C2,/*...*/> c = C0();

我想测量构建一系列此类对象的速度有多快以及调度的速度有多快。请注意，第一个示例（A/Bs...）在动态调度之上需要动态内存，而第二个示例（Cs...）具有自动存储。

为此，我将 B0 和 C0 推广到类型模板中：

// B0 generalised over its return value: B<X> overrides A::f() to return the
// template argument X.
template <unsigned X>
struct B : A {
  unsigned f() const noexcept override { return X; }
};

// C0 generalised over its return value: C<X>::f() is a plain (non-virtual)
// member function that returns the template argument X.
template <unsigned X>
struct C {
  auto f() const noexcept -> unsigned { return X; }
};

然后编写了一个（可能稍微过度设计的）测试工具，用来填充一个 std::vector 并将其读回。完整的代码附在下面。我使用 C++17，并分别以 -O1 和 -O3 编译运行。

实际上，它用 B<...> 和 C<...> 分别伪随机地填充预先分配好的向量 bs 和 cs，然后调用 bs[i]->f() 或 std::visit([](auto const& c) { return c.f(); },cs[i])（有关更多详细信息，请参见附加的基准代码）。

我原本的预期是：只含 variant<C<0>> 的测试实例会比对应的动态版本 unique_ptr<A> 快上几个数量级（确实如此）；但随着变体的备选类型增加到 variant<C<0>,...,C<127>>，visit 的效率会显著下降，直到动态调度反而更快（实际上并没有像预期那样发生）。

使用-O3（-O1结果非常相似）我看到以下结果，这些结果在运行中略有不同，但似乎相对稳定（时间大多保持在 10% 的偏差内）。

[0,0]  time ctor virtual: 35.0315 ns
[0,0]  time ctor variant: 2.9425 ns
[0,0]  time call virtual: 14.0037 ns    (L1)
[0,0]  time call variant: 1.44748 ns    (L2)

[0,1]  time ctor virtual: 34.8007 ns
[0,1]  time ctor variant: 2.95368 ns
[0,1]  time call virtual: 19.6874 ns
[0,1]  time call variant: 7.04521 ns

[0,7]  time ctor virtual: 39.6325 ns
[0,7]  time ctor variant: 2.97607 ns
[0,7]  time call virtual: 30.7592 ns
[0,7]  time call variant: 9.22505 ns    (L4.1)

[0,31]  time ctor virtual: 35.0002 ns
[0,31]  time ctor variant: 2.95473 ns
[0,31]  time call virtual: 24.3198 ns
[0,31]  time call variant: 9.72678 ns   (L4.2)

[0,127]  time ctor virtual: 36.5918 ns
[0,127]  time ctor variant: 2.95542 ns
[0,127]  time call virtual: 26.701 ns   (L3)
[0,127]  time call variant: 9.88592 ns  (L4.3)

讨论

我认为，(L1) 中较短的时间可以用缓存和/或分支预测来解释。(L2) 完全符合预期：如果变体只有一个备选类型，则调度非常快。构造的所有时间也说得通：ctor variant 在任何时候都不会调用 malloc，因此很清楚为什么它比 ctor virtual 快得多，并且无论类型数量多少，时间都大致恒定。

随着动态类型数量的增加，call virtual 的时间大致保持不变 (L3)，这在预料之中。但是，为什么 call variant 在 (L4.1) 和 (L4.3) 之间没有（更明显地）上升？

注意：受限于测试工具中的模板元编程，我无法进一步增大范围，否则 g++ 会在编译期间崩溃或耗尽我的内存。

无论如何，假设测试功能f尽可能简单，这意味着测量尽可能准确地记录所产生的开销。

验证

问题是，

我该如何验证这些结果具有代表性，并且
确保编译器没有把相关部分优化掉？
其他基准是否得出相同的结论，即std::variant调度总是快大约 2-3 倍？

完整的基准

// g++ -Wall -Wextra -pedantic -std=c++17 -O3 a.cpp

#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <random>
#include <ratio>
#include <variant>
#include <vector>

// Unit used for all reported timings: `chronores` is the std::ratio period of
// the duration that helper::Time accumulates, `resstr` is the label printed
// after each value. Change both together.
using chronores = std::nano;
static constexpr char const resstr[] = "ns";

// Compile-time helpers shared by every measure<LB,UB> instantiation.
namespace helper {

  // mkvar<T,X,UB,Args...>::type names std::variant<Args..., T<X>, ..., T<UB>>.
  // Primary template: append T<X> to the accumulated pack, continue with X+1.
  template <template <unsigned> typename T, unsigned X, unsigned UB, typename... Args>
  struct mkvar {
    using type = typename mkvar<T,X+1,UB,Args...,T<X>>::type;
  };
  // End of the recursion (X == UB): append the last alternative and form the
  // variant.
  template <template <unsigned> typename T, unsigned UB, typename... Args>
  struct mkvar<T,UB,UB,Args...> {
    using type = std::variant<Args...,T<UB>>;
  };
  // Convenience alias for std::variant<T<LB>, T<LB+1>, ..., T<UB>>.
  template <template <unsigned> typename T, unsigned LB, unsigned UB>
  using mkvar_t = typename mkvar<T,LB,UB>::type;

  // Wraps an unsigned constant in a type so that "UB + 1" can be matched by
  // the partial specialisation of ctor_Num below (via Num<UB>::inc).
  template <unsigned X>
  struct Num {
    static constexpr unsigned value = X;
    using inc = Num<X+1>;
  };

  // Maps the run-time value x onto the compile-time constant X: run() walks
  // X, X+1, ..., UB and, on the first match, appends a T<X> to the container:
  // by value when use_variant is true, as std::make_unique<T<X>>() otherwise.
  template <typename NumX, typename NumUB, template <unsigned> typename T, bool use_variant>
  struct ctor_Num {
    static constexpr auto X = NumX::value;
    static constexpr auto UB = NumUB::value;
    template <typename Container>
    static void run(unsigned x, Container& container) {
      if (x == X) {
        if constexpr (use_variant) {
          container.emplace_back(T<X>());
        } else {
          container.emplace_back(std::make_unique<T<X>>());
        }
      } else {
        ctor_Num<typename NumX::inc,NumUB,T,use_variant>::run(x,container);
      }
    }
  };
  // End of the chain (X == UB + 1): x was not in [X,UB]; nothing is appended.
  template <typename NumX, template <unsigned> typename T, bool use_variant>
  struct ctor_Num<typename NumX::inc,NumX,T,use_variant> {
    template <typename Container>
    static void run(unsigned, Container&) { }
  };

  // Entry point: appends a T<x> to `container` for a run-time x in [X,UB];
  // values outside that range are ignored.
  template <unsigned X, unsigned UB, template <unsigned> typename T, bool use_variant, typename Container>
  inline void ctor(unsigned x, Container& container) {
    return ctor_Num<Num<X>,Num<UB>,T,use_variant>::run(x,container);
  }

  // Scope timer: records steady_clock::now() on construction and, on
  // destruction, adds the elapsed time (in units of `chronores`) to the
  // referenced accumulator.
  struct Time {
    double& time;
    std::chrono::time_point<std::chrono::steady_clock> start;
    // explicit: a double& must never silently convert into a running timer.
    // `start` is set in the initialiser list so it is read as the very last
    // step of construction.
    explicit Time(double& time) : time(time), start(std::chrono::steady_clock::now()) {}
    // Not copyable: a copy would add the same interval to `time` a second time.
    Time(Time const&) = delete;
    Time& operator=(Time const&) = delete;
    ~Time() {
      auto const finish = std::chrono::steady_clock::now();
      time += std::chrono::duration<double,chronores>(finish-start).count();
    }
  };

}

// Benchmark for one closed range of types [LB,UB]: compares construction and
// call cost of the virtual hierarchy A/B<X> (held as std::unique_ptr<A>)
// against the value types C<X> (held in helper::mkvar_t<C,LB,UB>, i.e.
// std::variant<C<LB>,...,C<UB>>).
template <unsigned LB, unsigned UB>
struct measure {

  // Abstract interface for the dynamically dispatched side.
  struct A {
    virtual unsigned f() const noexcept = 0;
    virtual ~A() noexcept {}
  };

  // Dynamically dispatched implementation; f() returns X.
  template <unsigned X>
  struct B : A {
    virtual unsigned f() const noexcept override { return X; }
  };

  // Variant alternative; same f() as B<X> but without a base class.
  template <unsigned X>
  struct C {
    unsigned f() const noexcept { return X; }
  };


  // Runs the benchmark and prints four averages (time per element per round,
  // in units of `resstr`).
  //   N      - number of elements built and called in every round
  //   R      - number of rounds; timings are accumulated over all rounds
  //   warmup - when true, nothing is printed; a non-warmup call first runs
  //            one such silent single-round pass of itself
  static void main(std::size_t const N, std::size_t const R = 10, bool warmup = false) {
    if (!warmup) main(N,1,true);
    using namespace helper;
    // Capacity for N elements is reserved once; clear() below keeps it.
    std::vector<std::unique_ptr<A>> bs;
    bs.reserve(N);
    std::vector<mkvar_t<C,LB,UB>> cs;
    cs.reserve(N);
    std::uniform_int_distribution<unsigned> distr(LB,UB);
    // Accumulated elapsed time per measured phase (helper::Time adds to these).
    double time_ctor_virtual = 0;
    double time_ctor_variant = 0;
    double time_call_virtual = 0;
    double time_call_variant = 0;
    // volatile sink for the f() results, so the additions in the timed call
    // loops are not dropped as dead code.
    unsigned volatile sum = 0;
    // Fixed seed: every call of main() draws the same sequence of types.
    std::mt19937 mt(42); mt.discard(100);
    for (std::size_t r = 0; r < R; ++r) {
      bs.clear();
      cs.clear();
      // Phase 1: time constructing N objects of the single type B<UB>.
      {
        Time scope(time_ctor_virtual);
        for (std::size_t i = 0; i < N; ++i) {
          bs.emplace_back(std::make_unique<B<UB>>());
        }
      }
      // Phase 2: time constructing N variants holding C<UB>.
      {
        Time scope(time_ctor_variant);
        for (std::size_t i = 0; i < N; ++i) {
          cs.emplace_back(C<UB>());
        }
      }
      // Untimed: refill both containers with pseudo-randomly chosen types.
      // The same rn is used for both, so bs[i] and cs[i] return the same value.
      bs.clear();
      cs.clear();
      for (std::size_t i = 0; i < N; ++i) {
        auto const rn = distr(mt);
        // effectively calls bs.emplace_back(std::make_unique<B<rn>>())
        ctor<LB,UB,B,false>(rn,bs);
        // effectively calls cs.emplace_back(C<rn>())
        ctor<LB,UB,C,true >(rn,cs);
      }
      // Phase 3: time dispatch through std::visit (runs before the virtual
      // calls).
      {
        Time scope(time_call_variant);
        for (std::size_t i = 0; i < N; ++i) {
          sum += std::visit([](auto const& c) { return c.f(); },cs[i]);
        }
      }
      // Phase 4: time dispatch through the virtual call.
      {
        Time scope(time_call_virtual);
        for (std::size_t i = 0; i < N; ++i) {
          sum += bs[i]->f();
        }
      }
    }
    (void)sum;
    // Report the accumulated time divided by N and R; skipped for the warm-up
    // pass.
    if (!warmup) {
      std::cout << "[" << LB << "," << UB << "]  time ctor virtual: " << (time_ctor_virtual/N/R) << " " << resstr << "\n";
      std::cout << "[" << LB << "," << UB << "]  time ctor variant: " << (time_ctor_variant/N/R) << " " << resstr << "\n";
      std::cout << "[" << LB << "," << UB << "]  time call virtual: " << (time_call_virtual/N/R) << " " << resstr << "\n";
      std::cout << "[" << LB << "," << UB << "]  time call variant: " << (time_call_variant/N/R) << " " << resstr << "\n";
    }
  }
};

// Runs the benchmark for the type ranges [0,0], [0,1], [0,7], [0,31] and
// [0,127], printing an empty line after each block of results.
int main() {
  constexpr std::size_t element_count = 400000;
  auto const blank_line = [] { std::cout << "\n"; };

  measure<0,0>::main(element_count);
  blank_line();
  measure<0,1>::main(element_count);
  blank_line();
  measure<0,7>::main(element_count);
  blank_line();
  measure<0,31>::main(element_count);
  blank_line();
  measure<0,127>::main(element_count);
  blank_line();
}

c++ - 如何验证访问者（变体）静态调度与多态动态调度基准的结果？

0 回答 0

Related

Reference