c++ - 在编译时使用模板和/或 constexpr 构建函数

Question

代码是即时编写的，并且更改了名称约定，如果我弄得一团糟，很抱歉。我将在这里重写问题以使其更清楚。

在编译时有一些数据是已知的，两个整数数组D和E，都具有 length L。的每个元素D都是零或一。的每个元素都E包含一个值[0,L]。

然后我有一个X在运行时已知的向量，也是长度L。

D我想构建一个使用,E和计算某个值的函数，X例如：

int comp_rt(int i, array<int, L> X) {
    int v = 0;
    if (D[i] == 0) // D[i] known at compile-time
        return 10;
    for (int j = 0; j < E[i]; ++j) // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    return v;
}

由于这个计算被执行了很多次，我想减少开销，我认为在编译时执行检查D和循环会很棒。E

通常，为了让它更快，而不是使用comp_rt函数 - 这是一般情况，我会编写模板专用函数，对于 each i，只会做数学运算。例如：

N = 5
D = [0, 1, 1, 0, 1] // Values in {0, 1}
E = [1, 0, 3, 2, 4] // Values in [0, L-1]
X = [1, 3, 5, 7, 9] // Any integer

template <int i> int comp(array<int, L> X);
template <> int comp_tpl<0>(array<int, L> X) { return 10; } // D[0] == 0
template <> int comp_tpl<1>(array<int, L> X) { return 0; } // E[1] == 0, skip loop
template <> int comp_tpl<2>(array<int, L> X) { return X[0] + 2 * X[1] + 3 * X[2]; }
template <> int comp_tpl<3>(array<int, L> X) { return 10; }
template <> int comp_tpl<4>(array<int, L> X) { return compl_tpl<2>(X) + 4 * X[3]; }

我的问题是：是否可以使用模板和/或常量表达式在编译时使用Dand构建函数E，但执行速度与comp_tpl? 我的意思是构建“构建要在运行时计算的表达式”的东西，并且只剩下涉及X的计算在运行时。

而且，如果可能的话，它是如何完成的？哪些一般原则可用于解决此类问题？

我尝试使用模板来做到这一点，但生成的代码不如comp_tpl......我认为在运行时评估了一些递归调用。

score 2 · Accepted Answer

编辑：根据相关说明进行了更新：
Edit2：删除了Conditional.

这很像以前计算总和；它是尾递归的^*：

template<class T, size_t Length> struct Sum {
    template<class Array>
    static T comp(const Array &x, T add = 0) {
        return Sum<T, Length - 1>::comp(x, add + Length * x[Length - 1]);
    }
};

template<class T> struct Sum<T, 0> {
    template<class Array>
    static T comp(const Array &x, T add = 0) {
        return add;
    }
};

这是将其组合在一起并取决于d和的部分e。您可能可以对它们进行参数化，但我认为这比值得更麻烦。

constexpr int d[] = { 0, 1, 1, 0, 1 };
constexpr int e[] = { 1, 0, 3, 2, 4 };

template<int N> struct Comp {
    template<class Array>
    static int comp(const Array &x) {
        return d[N] ? Sum<int, e[N]>::comp(x) : 10;
    }
};

用法：

int x[] = { 1, 3, 5, 7, 9 };
Comp<3>::comp(x);

http://ideone.com/PmFBhU

(*) 不是真的，但足够接近。

score 1 · Accepted Answer

（更新：最后讨论了 clang++ 和 g++ 的计时实验。另外，为简单起见，我comp_rt在问题中使用了确切的主体，证明它可以完全优化，而无需我们重写函数主体。）

是的，这是可以做到的。但是，无论如何，g++ 似乎在你没有意识到的情况下为你做了很多这样的事情，请参阅最后的实验。但是，使用 clang++，您确实可以看到运行时版本变慢。

在下面的程序中，除了之外的所有参数X都作为模板参数传递。因此，将为comp_rt您使用的每种参数组合构建不同的模板函数。如果很大，这可能会导致您的二进制文件变L大。

我递的方式一D[i]==0开始可能很难理解。我把它放在一个enable_if. 这里有两种定义comp_tpl，一种是在什么时候使用，一种是在什么时候D[i]==0使用D[i]==1。老实说，这可能是不必要的，我怀疑即使您只是在单个comp_rt函数模板中使用函数的原始主体，代码仍然会以最佳方式编译。（我已经消除了这个并发症）。

我在函数中包含了这样的一行：

    using confirm_Ei_is_known_at_compile_time = array<char,E[i]>;

这证实E[i]了编译器在编译时知道这一点。这等效于 typedef，并且array在编译时必须知道 an 中的元素数量。例如，如果您尝试使用X[i]而不是E[i]作为的大小array，编译器将拒绝该代码。注意：这一行什么都不做，它只是在编译时进行健全性检查。

最后，鉴于E[i]在编译时已知，编译器能够展开循环（如果它认为它会加快循环）。一定要打开所有优化 - gcc 有一个选项-funroll-all-loops。

通过将相关参数作为模板参数传递，编译器可以进行更多优化。但我不确定它会选择这样做！需要进行实验。

这是我用于计时实验的完整程序。

#include<array>
#include<iostream>
using namespace std;

/*
 * L is a positive integer
 * D is vector of booleans of length L
 * E is a vector of ints [0,L) of length L
 * i will be in [0,L) also, therefore it is small enough that we can
 *         treat it as if it's known at compile time also
 *
 * The only thing that is *not* known at compile time is:
 * X is a vector of ints of length L
 *
 * Therefore, our goal is something like:
 *
 *   template<int L, int i, int D[L], int E[L]>
 *   int compute(int X[L]);
 */

template<int L, int i, const bool (&D)[L], const int (&E)[L]> // arrays passed, by reference, at compile-time
typename enable_if< D[i]==0 , int> :: type
comp_tpl(int (&)[L]) {
        return 10;
}
template<int L, int i, const bool (&D)[L], const int (&E)[L]> // arrays passed, by reference, at compile-time
typename enable_if< D[i]==1 , int> :: type
comp_tpl(int (&X)[L]) {
    int v = 0;
    //using confirm_Ei_is_known_at_compile_time = array<char,E[i]>;
    for (int j = 0; j < E[i]; ++j) // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    return v;
}

template<int L, int i, const bool (&D)[L], const int (&E)[L]> // arrays passed, by reference, at compile-time
int
comp_tpl_simple(int (&X)[L]) {
    if (D[i] == 0) // D[i] known at compile-time
        return 10;
    int v = 0;
    using confirm_Ei_is_known_at_compile_time = array<char,E[i]>;
    for (int j = 0; j < E[i]; ++j) // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    return v;
}

template<int L> // arrays passed, by reference, at compile-time
int
comp_rt(int i, const bool (&D)[L], const int (&E)[L], int (&X)[L]) {
    if (D[i] == 0) // D[i] known at compile-time
        return 10;
    int v = 0;
    for (int j = 0; j < E[i]; ++j) // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    return v;
}


constexpr int L = 5;
extern constexpr bool D[L] {0, 1, 1, 0, 1};  // Values in {0, 1}
extern constexpr int  E[L] {1, 0, 3, 2, 4}; // Values in [0, L-1]

void change_X_arbitrarily(int (&X)[L]) {
    for(int j=0; j<L; ++j)
        ++X[j];
}

int main() {
    int X[L] {1, 3, 5, 7, 9}; // Any integer

#ifdef USE_RUNTIME
    #define comp(L,i,D,E,X) comp_rt<L>(i,D,E,X)
#endif
#ifdef USE_TEMPLATE
    #define comp(L,i,D,E,X) comp_tpl_simple<L,i,D,E>(X)
#endif

    int total=0;
    for(int outer_reps=0; outer_reps<10000; ++outer_reps) {
        for(int inner_reps=0; inner_reps<100000; ++inner_reps) {
            total += comp(L,0,D,E,X);
            total += comp(L,1,D,E,X);
            total += comp(L,2,D,E,X);
            total += comp(L,3,D,E,X);
            total += comp(L,4,D,E,X);
        }
        change_X_arbitrarily(X);
    }

    cout << total << endl; // should be 39798784
}

请注意我如何使用 a#define来选择要使用的功能。我编译并运行：

$ clang++ SO.cpp -std=gnu++0x -O3 -DUSE_TEMPLATE -o SO && time -p ./SO
39798784  // the total value from all the calls, as a check
real 0.00
user 0.00
sys 0.00

计算 1,000,000,000 次需要零秒！但运行时版本需要 2.7 秒

$ clang++ SO.cpp -std=gnu++0x -O3 -DUSE_RUNTIME -o SO && time -p ./SO
39798784  // the total value from all the calls, as a check
real 2.70
user 2.68
sys 0.00

我在那里使用了clang3.3 -O3。

使用 g++ 4.8.2 时，我收到关于 -O3 未定义行为的警告，但奇怪的是，无论是运行时版本还是模板版本，运行时间都是零秒！也许 g++ 为我们启用了编译时技巧，即使在“运行时”模式下也是如此。这里的教训是，编译器真的比我们更了解优化！

无论如何，如果我退回到，g++-4.8.2 -O2那么无论哪种情况，运行时间都是 6.8 秒。很奇怪！有时添加更多O会减慢速度！

一个解释：在这种情况下，X实际上在编译时是已知的。它是这段代码中的一个局部变量，并且确定性地更新，因此编译器能够完全预测它并在编译时计算答案！看来 g++ 正在这样做（非常令人印象深刻！）。因此，在我最近的实验中，我移出X了main全局变量，现在优化行为“如预期”。一直比现在comp_tpl快得多comp_rt。

score 1 · Accepted Answer

（抱歉添加另一个答案）

（我会把示例代码放在最后）

我的实验和进一步的思考使我相信，原始代码可以按原样使用，只需稍作修改。编译器非常擅长优化，我发现有时很难放慢速度！只有将 volatile 标记为Xvolatile，或者使用中的随机数据不断对其进行编辑rand()，我才能真正让“运行时”版本变慢。

首先，如果你只有一个D向量并且只有一个E向量，那么你只需要放在constexpr数组声明的前面。

constexpr int D[] = { 0, 1, 1, 0, 1 };
constexpr int E[] = { 1, 0, 3, 2, 4 };

（如果您有多个这样的向量，并且您想为每个向量准备“预部分编译”函数，我们可以通过模板参数传递它们，正如我在另一个（冗长的）答案中所讨论的那样。）

我们还需要处理i原始函数中的索引：int comp_rt(int i, array<int, L> X);. 它应该是一个模板参数：

template<size_t i>
int comp_rt(array<int, L> X);

函数的主体不需要更改。编译器现在知道i,D[i]和E[i]是常量表达式。涉及D[i]和的表达式E[i]被它们的常数值替换。测试if(D[i]==0)在编译时由if (true)或if (false)根据需要被替换。此外，循环将被展开，因为编译器确切地知道E[i]. 循环展开，编译器可以看到v简直是一笔巨款。在这种情况下，它将用显式和替换它，删除所有零项，并将所有常数项相加，依此类推。所有这些都在编译时完成。我们几乎无能为力来帮助这里的编译器。这相当于可以使用的一些更复杂的模板解决方案。

将 g++ 和 clang++ 与 -O2 和 -O3 一起使用。

在我的一些实验中，gcc无论我需要多少次迭代，他的程序都在零秒内运行。这是因为该算法是确定性的，并且 gcc 可以提前计算会发生的所有事情X（即使我定期更改 X！）。在这种情况下，部分问题在于我创建了X一个局部变量，但教训是编译器可以看到确定性程序并提前计算所有内容，即使您不希望它这样做！ clang这里似乎没有那么积极地优化。

如果您有一些代码比您希望的要慢，并且您可以整理出一段完整的代码来演示慢速代码，那么也许我们可以建议其他小的更改。但我相信简单地使用constexpron data 和 for 的模板参数i就可以了。

在此示例代码中，我进行了另一项更改。我间接使用tuple_size<array<char, D[i]> > :: value而不是简单地使用D_i，这并没有真正改变含义，但我认为它会鼓励旧编译器在编译时进行评估。我的目标是尽可能地匹配代码的原始可读性，例如将整个函数放在一个地方，而不是将其拆分为多个模板。

constexpr int L   = 5;
constexpr int D[] = { 0, 1, 1, 0, 1 };
constexpr int E[] = { 1, 0, 3, 2, 4 };
template<int i>
int comp_rt(array<int, L> X) {
    using D_i_type = array<char, D[i]>;
    int v = 0;
    if (tuple_size<D_i_type>::value == 0) // D[i] known at compile-time
        return 10;
    using E_i_type = array<char, E[i]>;
    for (int j = 0; j < tuple_size<E_i_type>::value; ++j) { // E[i] known at compile-time
        v += X[j] * (j + 1); // X[j] known at run-time
    }
    return v;
}

score -1 · Accepted Answer

使用 constexpr 函数，可以在编译时完成

#include <iostream>
#include <array>
#include <utility>

#include <cstddef>
#include <type_traits>

  /// A type that represents a parameter pack of zero or more integers.
  template<typename T, T... I>
    struct integer_sequence
    {
      static_assert( std::is_integral<T>::value, "Integral type" );

      using type = T;

      static constexpr T size = sizeof...(I);

      /// Generate an integer_sequence with an additional element.
      template<T N>
        using append = integer_sequence<T, I..., N>;

      using next = append<size>;
    };

  template<typename T, T... I>
    constexpr T integer_sequence<T, I...>::size;

  template<std::size_t... I>
    using index_sequence = integer_sequence<std::size_t, I...>;

  namespace detail
  {
    // Metafunction that generates an integer_sequence of T containing [0, N)
    template<typename T, T Nt, std::size_t N>
      struct iota
      {
        static_assert( Nt >= 0, "N cannot be negative" );

        using type = typename iota<T, Nt-1, N-1>::type::next;
      };

    // Terminal case of the recursive metafunction.
    template<typename T, T Nt>
      struct iota<T, Nt, 0ul>
      {
        using type = integer_sequence<T>;
      };
  }


  // make_integer_sequence<T, N> is an alias for integer_sequence<T, 0,...N-1>
  template<typename T, T N>
    using make_integer_sequence = typename detail::iota<T, N, N>::type;

  template<int N>
    using make_index_sequence = make_integer_sequence<std::size_t, N>;


  // index_sequence_for<A, B, C> is an alias for index_sequence<0, 1, 2>
  template<typename... Args>
    using index_sequence_for = make_index_sequence<sizeof...(Args)>;
//--------------------My part starts here-------------------------------------------------------
template <size_t N> constexpr int computebis(int bound,std::array<int,N> X,int j)
{
  return (j<bound) ? X[j]*(j+1) + computebis(bound,X,j+1) : 0;
}

template <size_t N> constexpr int compute2(std::array<int,N> D,
                                            std::array<int,N> E,
                                            std::array<int,N> X,int index)
{
  return (D[index]==0) ? 10 : computebis(E[index],X,0);
}


template <size_t N,std::size_t... Indices> constexpr std::array<int,N> mfill(std::array<int,N> D,
                                                                            std::array<int,N> E,
                                                                            std::array<int,N> X,
                                                                            index_sequence<Indices...>)
{
  return {{ compute2(D,E,X,Indices)... }};
}

template <size_t N> constexpr std::array<int,N> mfill(std::array<int,N> D,std::array<int,N> E,std::array<int,N> X)
{
  return mfill(D,E,X,make_index_sequence<N>{});
}


int main(int argc, char *argv[])
{

  std::array<int,5> D= {0,1,1,0,1};
  std::array<int,5> E= {1,0,3,2,4};
  std::array<int,5> X= {1,3,5,7,9};
  //to be sure that it is done at compil time
  const auto X2 =  mfill(D,E,X);

  for(auto e:X2){
    std::cout<<e<<std::endl;
  }

编辑：代码更新灵感来自在 C++11 中创建 N 元素 constexpr 数组，我在那里完成了所有第一部分

c++ - 在编译时使用模板和/或 constexpr 构建函数

4 回答 4

Related

Reference