我正在研究矩阵乘法中的表达式模板性能。我在矩阵乘法中展开循环,我发现本机双精度数的性能与表达式模板的性能相等,直到表达式的深度变得太大,表达式模板的性能下降。这是代码:
#include <iostream>
#include <vector>
#include <chrono>
template<typename T>
struct Real
{
typedef T RealType;
Real() noexcept : m_real(0) {}
inline explicit Real(RealType real) noexcept
: m_real(real)
{
}
inline RealType value() const noexcept
{
return m_real;
}
template<typename Expr>
void operator+=(const Expr& expr)
{
m_real += expr.value();
}
RealType m_real;
};
#define DEFINE_BINARY_OPERATOR(NAME, OP) \
template<typename Expr1, typename Expr2> \
struct NAME \
{ \
typedef typename Expr1::RealType RealType; \
\
NAME() noexcept {} \
NAME(const Expr1& e1, const Expr2& e2) noexcept \
: m_e1(e1), m_e2(e2) {} \
\
inline RealType value() const noexcept \
{ \
return m_e1.value() OP m_e2.value(); \
} \
\
Expr1 m_e1; \
Expr2 m_e2; \
}; \
template<typename Expr1, typename Expr2> \
inline decltype(auto) operator OP (const Expr1& e1, const Expr2& e2) noexcept\
{ \
return NAME<Expr1, Expr2>(e1, e2); \
} \
DEFINE_BINARY_OPERATOR(Multiply, *)
DEFINE_BINARY_OPERATOR(Add, +)
DEFINE_BINARY_OPERATOR(Subtract, -)
DEFINE_BINARY_OPERATOR(Divide, /)
template<typename T>
struct Matrix
{
explicit Matrix(size_t size)
: m_matrix(size, std::vector<T>(size))
{
}
explicit Matrix(size_t size, const T& intialVal)
: m_matrix(size, std::vector<T>(size, intialVal))
{
}
std::vector<T>& operator[](size_t row) { return m_matrix[row]; }
const std::vector<T>& operator[](size_t row) const { return m_matrix[row]; }
size_t size() const { return m_matrix.size(); }
std::vector<std::vector<T> > m_matrix;
};
#define MATRIX_MULT_KERNEL(N) m1[i][k+N] * m2[j][k+N]
#define MATRIX_MULT_ADD_KERNELS_4(N) \
MATRIX_MULT_KERNEL(N) + MATRIX_MULT_KERNEL(N+1) + MATRIX_MULT_KERNEL(N+2) + MATRIX_MULT_KERNEL(N+3)
template<typename T>
Matrix<T> operator*(const Matrix<T>& m1, const Matrix<T>& m2)
{
if (m1.size() != m2.size())
throw std::runtime_error("wrong sizes");
Matrix<T> m3(m1.size());
for (size_t i = 0; i < m1.size(); ++i)
for (size_t j = 0; j < m1.size(); ++j)
for (size_t k = 0; k < m1.size(); k+=16)
{
auto v0 = MATRIX_MULT_ADD_KERNELS_4(0);
auto v1 = MATRIX_MULT_ADD_KERNELS_4(4);
auto v2 = MATRIX_MULT_ADD_KERNELS_4(8);
auto v3 = MATRIX_MULT_ADD_KERNELS_4(12);
// auto v4 = MATRIX_MULT_ADD_KERNELS_4(16);
// auto v5 = MATRIX_MULT_ADD_KERNELS_4(20);
// auto v6 = MATRIX_MULT_ADD_KERNELS_4(24);
// auto v7 = MATRIX_MULT_ADD_KERNELS_4(28);
auto expr = (v0 + v1 + v2 + v3);// + v4 + v5 + v6 + v7);
m3[i][j] += expr;
}
return m3;
}
decltype(auto) now()
{
return std::chrono::high_resolution_clock::now();
}
decltype(auto) milliseconds(const decltype(now())& start, const decltype(now())& end)
{
return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}
int main()
{
constexpr static const int SIZE = 1024;
{
Matrix<double> m1(SIZE, 1.0);
Matrix<double> m2(SIZE, 1.0);
auto begin = now();
Matrix<double> m3 = m1 * m2;
auto end = now();
std::cout << milliseconds(begin, end) << "ms" << std::endl;
}
{
Matrix<Real<double> > m1(SIZE, Real<double>(1.0));
Matrix<Real<double> > m2(SIZE, Real<double>(1.0));
auto begin = now();
Matrix<Real<double> > m3 = m1 * m2;
auto end = now();
std::cout << milliseconds(begin, end) << "ms" << std::endl;
}
}
如果我取消注释矩阵乘法循环中的代码,并将 k 增加 32,则表达式模板需要三倍的时间。有谁知道为什么会发生这种情况?
我在 Intel Xeon E3-1225 V2 上的 cygwin 上编译 GCC 5.4。