Speed up small determinant LU fallback libeigen/eigen!2485
diff --git a/Eigen/src/Core/FindCoeff.h b/Eigen/src/Core/FindCoeff.h index 1c07260..b2645d8 100644 --- a/Eigen/src/Core/FindCoeff.h +++ b/Eigen/src/Core/FindCoeff.h
@@ -173,6 +173,10 @@ Index& inner) { Index outerSize = eval.outerSize(); Index innerSize = eval.innerSize(); + if (innerSize < PacketSize) { + ScalarImpl::run(eval, func, result, outer, inner); + return; + } Index packetEnd = numext::round_down(innerSize, PacketSize); /* initialization performed in calling function */ @@ -229,6 +233,10 @@ static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) { Index size = eval.size(); + if (size < PacketSize) { + ScalarImpl::run(eval, func, result, index); + return; + } Index packetEnd = numext::round_down(size, PacketSize); /* initialization performed in calling function */
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 33b1fb3..d789f75 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h
@@ -277,6 +277,41 @@ for (Index i = 0; i < rows; ++i) func(dst.row(i), lhsEval.coeff(i, Index(0)) * actual_rhs); } +template <typename Dst> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool outer_product_use_small_assignment(const Dst& dst) { + return dst.rows() <= 16 && dst.cols() <= 16; +} + +template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar> +void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func, + const Scalar& alpha, const false_type&) { + evaluator<Rhs> rhsEval(rhs); + ei_declare_local_nested_eval(Lhs, lhs, Rhs::SizeAtCompileTime, actual_lhs); + const Index rows = dst.rows(); + const Index cols = dst.cols(); + for (Index j = 0; j < cols; ++j) { + const Scalar rhs_j = rhsEval.coeff(Index(0), j); + for (Index i = 0; i < rows; ++i) { + func.assignCoeff(dst.coeffRef(i, j), alpha * (rhs_j * actual_lhs.coeff(i, Index(0)))); + } + } +} + +template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar> +void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func, + const Scalar& alpha, const true_type&) { + evaluator<Lhs> lhsEval(lhs); + ei_declare_local_nested_eval(Rhs, rhs, Lhs::SizeAtCompileTime, actual_rhs); + const Index rows = dst.rows(); + const Index cols = dst.cols(); + for (Index i = 0; i < rows; ++i) { + const Scalar lhs_i = lhsEval.coeff(i, Index(0)); + for (Index j = 0; j < cols; ++j) { + func.assignCoeff(dst.coeffRef(i, j), alpha * (lhs_i * actual_rhs.coeff(Index(0), j))); + } + } +} + template <typename Lhs, typename Rhs> struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> { template <typename T> @@ -317,23 +352,43 @@ template <typename Dst> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>()); + if (internal::outer_product_use_small_assignment(dst)) { + internal::outer_product_selector_run_small(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>(), + Scalar(1), is_row_major<Dst>()); + } else { + internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>()); + } } template <typename Dst> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>()); + if (internal::outer_product_use_small_assignment(dst)) { + internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(), + Scalar(1), is_row_major<Dst>()); + } else { + internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>()); + } } template <typename Dst> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>()); + if (internal::outer_product_use_small_assignment(dst)) { + internal::outer_product_selector_run_small(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>(), + Scalar(1), is_row_major<Dst>()); + } else { + internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>()); + } } template <typename Dst> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { - internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>()); + if (internal::outer_product_use_small_assignment(dst)) { + internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(), + alpha, is_row_major<Dst>()); + } else { + internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>()); + } } };
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index ae4fee3..0427c01 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h
@@ -25,10 +25,7 @@ template <typename Derived, int DeterminantType = Derived::RowsAtCompileTime> struct determinant_impl { - static inline typename traits<Derived>::Scalar run(const Derived& m) { - if (Derived::ColsAtCompileTime == Dynamic && m.rows() == 0) return typename traits<Derived>::Scalar(1); - return m.partialPivLu().determinant(); - } + static inline typename traits<Derived>::Scalar run(const Derived& m) { return internal::partial_lu_determinant(m); } }; template <typename Derived>
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 4f4227e..5a22051 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h
@@ -338,9 +338,10 @@ const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows, cols); - // For small compile-time matrices it is worth processing the last row separately: + // For small compile-time matrices and square runtime matrices it is worth processing the last row separately: // speedup: +100% for 2x2, +10% for others. - const Index endk = UnBlockedAtCompileTime ? size - 1 : size; + const bool process_last_row_separately = UnBlockedAtCompileTime || rows == cols; + const Index endk = process_last_row_separately ? size - 1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; for (Index k = 0; k < endk; ++k) { @@ -366,13 +367,14 @@ first_zero_pivot = k; } - if (k < rows - 1) + // Skip the trailing update for rectangular panels with no remaining columns. + if (rrows > 0 && rcols > 0) lu.bottomRightCorner(fix<RRows>(rrows), fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols)); } // special handling of the last entry - if (UnBlockedAtCompileTime) { + if (process_last_row_separately) { Index k = endk; row_transpositions[k] = PivIndex(k); if (numext::is_exactly_zero(Scoring()(lu(k, k))) && first_zero_pivot == -1) first_zero_pivot = k; @@ -427,7 +429,6 @@ // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0 = lu.block(0, 0, rows, k); BlockType A_2 = lu.block(0, k + bs, rows, tsize); BlockType A11 = lu.block(k, k, bs, bs); BlockType A12 = lu.block(k, k + bs, bs, tsize); @@ -443,9 +444,12 @@ nb_transpositions += nb_transpositions_in_panel; // update permutations and apply them to A_0 - for (Index i = k; i < k + bs; ++i) { - Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k)); - A_0.row(i).swap(A_0.row(piv)); + if (k > 0) { + BlockType A_0 = lu.block(0, 0, rows, k); + for (Index i = k; i < k + bs; ++i) { + Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k)); + A_0.row(i).swap(A_0.row(piv)); + } } if (trows) { @@ -483,6 +487,29 @@ nb_transpositions); } +/** \internal returns the determinant computed from an in-place partial-pivoting + * LU decomposition of \a m without constructing a PartialPivLU object. + */ +template <typename Derived> +typename traits<Derived>::Scalar partial_lu_determinant(const Derived& m) { + typedef typename traits<Derived>::Scalar Scalar; + if (m.rows() == 0) return Scalar(1); + EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) + + typedef typename plain_matrix_type<Derived>::type PlainObject; + typedef Transpositions<PlainObject::RowsAtCompileTime, PlainObject::MaxRowsAtCompileTime, DefaultPermutationIndex> + TranspositionType; + + eigen_assert(m.rows() < NumTraits<DefaultPermutationIndex>::highest()); + PlainObject lu(m); + + TranspositionType row_transpositions(lu.rows()); + typename TranspositionType::StorageIndex nb_transpositions; + partial_lu_inplace(lu, row_transpositions, nb_transpositions); + + return Scalar((nb_transpositions % 2) ? -1 : 1) * lu.diagonal().prod(); +} + } // end namespace internal template <typename MatrixType, typename PermutationIndex>
diff --git a/test/determinant.cpp b/test/determinant.cpp index 33f430b..88ca1dd 100644 --- a/test/determinant.cpp +++ b/test/determinant.cpp
@@ -10,6 +10,96 @@ #include "main.h" #include <Eigen/LU> +#include <algorithm> +#include <limits> +#include <vector> + +template <typename Scalar> +struct determinant_reference_scalar { + typedef long double type; +}; + +template <typename RealScalar> +struct determinant_reference_scalar<std::complex<RealScalar> > { + typedef std::complex<long double> type; +}; + +template <typename MatrixType> +typename determinant_reference_scalar<typename MatrixType::Scalar>::type brute_force_determinant(const MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename determinant_reference_scalar<Scalar>::type ReferenceScalar; + const Index size = m.rows(); + std::vector<Index> permutation(size); + for (Index i = 0; i < size; ++i) permutation[i] = i; + + ReferenceScalar result(0); + do { + int inversions = 0; + for (Index i = 0; i < size; ++i) + for (Index j = i + 1; j < size; ++j) + if (permutation[i] > permutation[j]) ++inversions; + + ReferenceScalar term(1); + for (Index i = 0; i < size; ++i) term *= ReferenceScalar(m(i, permutation[i])); + result += inversions % 2 ? -term : term; + } while (std::next_permutation(permutation.begin(), permutation.end())); + + return result; +} + +template <typename MatrixType> +void verify_determinant_against_reference(const MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + + Scalar expected = Scalar(brute_force_determinant(m)); + Scalar actual = m.determinant(); + const RealScalar max_abs = (std::max)(RealScalar(1), m.cwiseAbs().maxCoeff()); + RealScalar scale = max_abs; + for (Index i = 1; i < m.rows(); ++i) scale *= max_abs; + RealScalar tolerance = RealScalar(100000) * NumTraits<RealScalar>::epsilon() * scale; + RealScalar error = numext::abs(actual - expected); + if (error > tolerance) std::cerr << "determinant error " << error << " exceeds tolerance " << tolerance << std::endl; + VERIFY(error <= tolerance); +} + +template <typename MatrixType> +void determinant_lu_fallback_reference(const MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + const Index size = m.rows(); + + MatrixType random(size, size); + random.setRandom(); + verify_determinant_against_reference(random); + + MatrixType ill_conditioned = random; + ill_conditioned.col(0) = ill_conditioned.col(1) + RealScalar(1e-8) * ill_conditioned.col(0); + verify_determinant_against_reference(ill_conditioned); + + MatrixType scaled = RealScalar(1e6) * random; + verify_determinant_against_reference(scaled); + + MatrixType tiny = MatrixType::Identity(size, size); + tiny(0, 0) = Scalar((std::numeric_limits<RealScalar>::min)()); + verify_determinant_against_reference(tiny); + +#if !EIGEN_ARCH_ARM + MatrixType subnormal = MatrixType::Identity(size, size); + subnormal(0, 0) = Scalar(std::numeric_limits<RealScalar>::denorm_min() * RealScalar(16)); + verify_determinant_against_reference(subnormal); +#endif +} + +void determinant_non_finite_lu_fallback() { + Matrix<double, 5, 5> m = Matrix<double, 5, 5>::Identity(); + m(0, 0) = std::numeric_limits<double>::quiet_NaN(); + VERIFY((numext::isnan)(m.determinant())); + + m = Matrix<double, 5, 5>::Identity(); + m(0, 0) = std::numeric_limits<double>::infinity(); + VERIFY((numext::isinf)(m.determinant())); +} template <typename MatrixType> void determinant(const MatrixType& m) { @@ -57,9 +147,13 @@ CALL_SUBTEST_2(determinant(Matrix<double, 2, 2>())); CALL_SUBTEST_3(determinant(Matrix<double, 3, 3>())); CALL_SUBTEST_4(determinant(Matrix<double, 4, 4>())); - CALL_SUBTEST_5(determinant(Matrix<std::complex<double>, 10, 10>())); + CALL_SUBTEST_5(determinant(Matrix<double, 5, 5>())); + CALL_SUBTEST_6(determinant(Matrix<std::complex<double>, 10, 10>())); + CALL_SUBTEST_7(determinant_lu_fallback_reference(Matrix<double, 5, 5>())); + CALL_SUBTEST_8(determinant_lu_fallback_reference(MatrixXcd(5, 5))); + CALL_SUBTEST_9(determinant_non_finite_lu_fallback()); s = internal::random<int>(1, EIGEN_TEST_MAX_SIZE / 4); - CALL_SUBTEST_6(determinant(MatrixXd(s, s))); + CALL_SUBTEST_10(determinant(MatrixXd(s, s))); TEST_SET_BUT_UNUSED_VARIABLE(s); } }