寻找最好的算法我发现有一个权衡:一方面是实现的复杂性和大常数,另一方面是运行时的复杂性。我选择基于 LU 分解的算法,因为它实现起来非常简单,并且性能足够好。
#include <valarray>
#include <vector>
#include <utility>
#include <cmath>
#include <cstddef>
#include <cassert>
template< typename value_type >
struct math
{
using size_type = std::size_t;
size_type const dimension_;
value_type const & eps;
value_type const zero = value_type(0);
value_type const one = value_type(1);
private :
using vector = std::valarray< value_type >;
using matrix = std::vector< vector >;
matrix matrix_;
matrix minor_;
public :
math(size_type const _dimension,
value_type const & _eps)
: dimension_(_dimension)
, eps(_eps)
, matrix_(dimension_)
, minor_(dimension_ - 1)
{
assert(1 < dimension_);
assert(!(eps < zero));
for (size_type r = 0; r < dimension_; ++r) {
matrix_[r].resize(dimension_);
}
size_type const minor_size = dimension_ - 1;
for (size_type r = 0; r < minor_size; ++r) {
minor_[r].resize(minor_size);
}
}
template< typename rhs = matrix >
void
operator = (rhs const & _matrix)
{
auto irow = std::begin(matrix_);
for (auto const & row_ : _matrix) {
auto icol = std::begin(*irow);
for (auto const & v : row_) {
*icol = v;
++icol;
}
++irow;
}
}
value_type
det(matrix & _matrix,
size_type const _dimension)
{ // calculates lower unit triangular matrix and upper triangular
assert(0 < _dimension);
value_type det_ = one;
for (size_type i = 0; i < _dimension; ++i) {
vector & ri_ = _matrix[i];
using std::abs;
value_type max_ = abs(ri_[i]);
size_type pivot = i;
{
size_type p = i;
while (++p < _dimension) {
value_type y_ = abs(_matrix[p][i]);
if (max_ < y_) {
max_ = std::move(y_);
pivot = p;
}
}
}
if (!(eps < max_)) { // regular?
return zero; // singular
}
if (pivot != i) {
det_ = -det_; // each permutation flips sign of det
ri_.swap(_matrix[pivot]);
}
value_type & dia_ = ri_[i];
det_ *= dia_; // det is multiple of diagonal elements
for (size_type j = 1 + i; j < _dimension; ++j) {
_matrix[j][i] /= dia_;
}
for (size_type a = 1 + i; a < _dimension; ++a) {
vector & a_ = minor_[a - 1];
value_type const & ai_ = _matrix[a][i];
for (size_type b = 1 + i; b < _dimension; ++b) {
a_[b - 1] = ai_ * ri_[b];
}
}
for (size_type a = 1 + i; a < _dimension; ++a) {
vector const & a_ = minor_[a - 1];
vector & ra_ = _matrix[a];
for (size_type b = 1 + i; b < _dimension; ++b) {
ra_[b] -= a_[b - 1];
}
}
}
return det_;
}
value_type
det(size_type const _dimension)
{
return det(matrix_, _dimension);
}
value_type
det()
{
return det(dimension_);
}
};
// main.cpp
#include <iostream>
#include <cstdlib>
int
main()
{
using value_type = double;
value_type const eps = std::numeric_limits< value_type >::epsilon();
std::size_t const dimension_ = 3;
math< value_type > m(dimension_, eps);
m = { // example from https://en.wikipedia.org/wiki/Determinant#Laplace.27s_formula_and_the_adjugate_matrix
{-2.0, 2.0, -3.0},
{-1.0, 1.0, 3.0},
{ 2.0, 0.0, -1.0}
};
std::cout << m.det() << std::endl; // 18
return EXIT_SUCCESS;
}
det()
函数是算法中最热门的函数,它使用它作为一部分。我肯定det()
没有尽可能快,因为运行时性能比较(使用google-pprof)与整个算法的参考实现det()
显示出与.
如何提高det()
功能的性能?哪些明显的优化可以立即应用?我应该更改索引和内存访问顺序还是其他?容器类型?预取?
的典型值dimension_
在 3 到 10 的范围内(但可以是 100,如果value_type
是 mpfr 或其他值)。