Speed up small determinant LU fallback

Three related optimizations: (1) determinant() now runs an in-place
partial-pivoting LU directly instead of constructing a PartialPivLU
object; (2) the coefficient-visitor packet path falls back to the scalar
implementation when the inner size is below the packet size; (3) small
outer products (<= 16x16) use direct coefficient loops instead of the
per-column/per-row expression machinery.

libeigen/eigen!2485

diff --git a/Eigen/src/Core/FindCoeff.h b/Eigen/src/Core/FindCoeff.h
index 1c07260..b2645d8 100644
--- a/Eigen/src/Core/FindCoeff.h
+++ b/Eigen/src/Core/FindCoeff.h
@@ -173,6 +173,10 @@
                                            Index& inner) {
     Index outerSize = eval.outerSize();
     Index innerSize = eval.innerSize();
+    if (innerSize < PacketSize) {
+      ScalarImpl::run(eval, func, result, outer, inner);
+      return;
+    }
     Index packetEnd = numext::round_down(innerSize, PacketSize);
 
     /* initialization performed in calling function */
@@ -229,6 +233,10 @@
 
   static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
     Index size = eval.size();
+    if (size < PacketSize) {
+      ScalarImpl::run(eval, func, result, index);
+      return;
+    }
     Index packetEnd = numext::round_down(size, PacketSize);
 
     /* initialization performed in calling function */
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index 33b1fb3..d789f75 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -277,6 +277,41 @@
   for (Index i = 0; i < rows; ++i) func(dst.row(i), lhsEval.coeff(i, Index(0)) * actual_rhs);
 }
 
+template <typename Dst>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool outer_product_use_small_assignment(const Dst& dst) {
+  return dst.rows() <= 16 && dst.cols() <= 16;
+}
+
+template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar>
+void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                        const Scalar& alpha, const false_type&) {
+  evaluator<Rhs> rhsEval(rhs);
+  ei_declare_local_nested_eval(Lhs, lhs, Rhs::SizeAtCompileTime, actual_lhs);
+  const Index rows = dst.rows();
+  const Index cols = dst.cols();
+  for (Index j = 0; j < cols; ++j) {
+    const Scalar rhs_j = rhsEval.coeff(Index(0), j);
+    for (Index i = 0; i < rows; ++i) {
+      func.assignCoeff(dst.coeffRef(i, j), alpha * (rhs_j * actual_lhs.coeff(i, Index(0))));
+    }
+  }
+}
+
+template <typename Dst, typename Lhs, typename Rhs, typename Func, typename Scalar>
+void EIGEN_DEVICE_FUNC outer_product_selector_run_small(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                        const Scalar& alpha, const true_type&) {
+  evaluator<Lhs> lhsEval(lhs);
+  ei_declare_local_nested_eval(Rhs, rhs, Lhs::SizeAtCompileTime, actual_rhs);
+  const Index rows = dst.rows();
+  const Index cols = dst.cols();
+  for (Index i = 0; i < rows; ++i) {
+    const Scalar lhs_i = lhsEval.coeff(i, Index(0));
+    for (Index j = 0; j < cols; ++j) {
+      func.assignCoeff(dst.coeffRef(i, j), alpha * (lhs_i * actual_rhs.coeff(Index(0), j)));
+    }
+  }
+}
+
 template <typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
   template <typename T>
@@ -317,23 +352,43 @@
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());
+    } else {
+      internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());
+    } else {
+      internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
-    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 Scalar(1), is_row_major<Dst>());
+    } else {
+      internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+    }
   }
 
   template <typename Dst>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
                                                                   const Scalar& alpha) {
-    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+    if (internal::outer_product_use_small_assignment(dst)) {
+      internal::outer_product_selector_run_small(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>(),
+                                                 alpha, is_row_major<Dst>());
+    } else {
+      internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+    }
   }
 };
 
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index ae4fee3..0427c01 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -25,10 +25,7 @@
 
 template <typename Derived, int DeterminantType = Derived::RowsAtCompileTime>
 struct determinant_impl {
-  static inline typename traits<Derived>::Scalar run(const Derived& m) {
-    if (Derived::ColsAtCompileTime == Dynamic && m.rows() == 0) return typename traits<Derived>::Scalar(1);
-    return m.partialPivLu().determinant();
-  }
+  static inline typename traits<Derived>::Scalar run(const Derived& m) { return internal::partial_lu_determinant(m); }
 };
 
 template <typename Derived>
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index 4f4227e..5a22051 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -338,9 +338,10 @@
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows, cols);
-    // For small compile-time matrices it is worth processing the last row separately:
+    // For small compile-time matrices and square runtime matrices it is worth processing the last row separately:
     //  speedup: +100% for 2x2, +10% for others.
-    const Index endk = UnBlockedAtCompileTime ? size - 1 : size;
+    const bool process_last_row_separately = UnBlockedAtCompileTime || rows == cols;
+    const Index endk = process_last_row_separately ? size - 1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
     for (Index k = 0; k < endk; ++k) {
@@ -366,13 +367,14 @@
         first_zero_pivot = k;
       }
 
-      if (k < rows - 1)
+      // Skip the rank-1 trailing update when either no rows or no columns remain beyond the pivot.
+      if (rrows > 0 && rcols > 0)
         lu.bottomRightCorner(fix<RRows>(rrows), fix<RCols>(rcols)).noalias() -=
             lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
     }
 
     // special handling of the last entry
-    if (UnBlockedAtCompileTime) {
+    if (process_last_row_separately) {
       Index k = endk;
       row_transpositions[k] = PivIndex(k);
       if (numext::is_exactly_zero(Scoring()(lu(k, k))) && first_zero_pivot == -1) first_zero_pivot = k;
@@ -427,7 +429,6 @@
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0 = lu.block(0, 0, rows, k);
       BlockType A_2 = lu.block(0, k + bs, rows, tsize);
       BlockType A11 = lu.block(k, k, bs, bs);
       BlockType A12 = lu.block(k, k + bs, bs, tsize);
@@ -443,9 +444,12 @@
 
       nb_transpositions += nb_transpositions_in_panel;
       // update permutations and apply them to A_0
-      for (Index i = k; i < k + bs; ++i) {
-        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
-        A_0.row(i).swap(A_0.row(piv));
+      if (k > 0) {
+        BlockType A_0 = lu.block(0, 0, rows, k);
+        for (Index i = k; i < k + bs; ++i) {
+          Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
+          A_0.row(i).swap(A_0.row(piv));
+        }
       }
 
       if (trows) {
@@ -483,6 +487,29 @@
                  nb_transpositions);
 }
 
+/** \internal Returns the determinant of \a m via an in-place partial-pivoting LU
+ * decomposition, without constructing a PartialPivLU object. Returns 1 for empty
+ * matrices up front — the unblocked LU kernel's last-row path assumes size >= 1. */
+template <typename Derived>
+typename traits<Derived>::Scalar partial_lu_determinant(const Derived& m) {
+  typedef typename traits<Derived>::Scalar Scalar;
+  if (m.rows() == 0) return Scalar(1);
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  typedef typename plain_matrix_type<Derived>::type PlainObject;
+  typedef Transpositions<PlainObject::RowsAtCompileTime, PlainObject::MaxRowsAtCompileTime, DefaultPermutationIndex>
+      TranspositionType;
+
+  eigen_assert(m.rows() < NumTraits<DefaultPermutationIndex>::highest());
+  PlainObject lu(m);
+
+  TranspositionType row_transpositions(lu.rows());
+  typename TranspositionType::StorageIndex nb_transpositions;
+  partial_lu_inplace(lu, row_transpositions, nb_transpositions);
+
+  return Scalar((nb_transpositions % 2) ? -1 : 1) * lu.diagonal().prod();
+}
+
 }  // end namespace internal
 
 template <typename MatrixType, typename PermutationIndex>
diff --git a/test/determinant.cpp b/test/determinant.cpp
index 33f430b..88ca1dd 100644
--- a/test/determinant.cpp
+++ b/test/determinant.cpp
@@ -10,6 +10,96 @@
 
 #include "main.h"
 #include <Eigen/LU>
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+template <typename Scalar>
+struct determinant_reference_scalar {
+  typedef long double type;
+};
+
+template <typename RealScalar>
+struct determinant_reference_scalar<std::complex<RealScalar> > {
+  typedef std::complex<long double> type;
+};
+
+template <typename MatrixType>
+typename determinant_reference_scalar<typename MatrixType::Scalar>::type brute_force_determinant(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename determinant_reference_scalar<Scalar>::type ReferenceScalar;
+  const Index size = m.rows();
+  std::vector<Index> permutation(size);
+  for (Index i = 0; i < size; ++i) permutation[i] = i;
+
+  ReferenceScalar result(0);
+  do {
+    int inversions = 0;
+    for (Index i = 0; i < size; ++i)
+      for (Index j = i + 1; j < size; ++j)
+        if (permutation[i] > permutation[j]) ++inversions;
+
+    ReferenceScalar term(1);
+    for (Index i = 0; i < size; ++i) term *= ReferenceScalar(m(i, permutation[i]));
+    result += inversions % 2 ? -term : term;
+  } while (std::next_permutation(permutation.begin(), permutation.end()));
+
+  return result;
+}
+
+template <typename MatrixType>
+void verify_determinant_against_reference(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Scalar expected = Scalar(brute_force_determinant(m));
+  Scalar actual = m.determinant();
+  const RealScalar max_abs = (std::max)(RealScalar(1), m.cwiseAbs().maxCoeff());
+  RealScalar scale = max_abs;
+  for (Index i = 1; i < m.rows(); ++i) scale *= max_abs;
+  RealScalar tolerance = RealScalar(100000) * NumTraits<RealScalar>::epsilon() * scale;
+  RealScalar error = numext::abs(actual - expected);
+  if (error > tolerance) std::cerr << "determinant error " << error << " exceeds tolerance " << tolerance << std::endl;
+  VERIFY(error <= tolerance);
+}
+
+template <typename MatrixType>
+void determinant_lu_fallback_reference(const MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  const Index size = m.rows();
+
+  MatrixType random(size, size);
+  random.setRandom();
+  verify_determinant_against_reference(random);
+
+  MatrixType ill_conditioned = random;
+  ill_conditioned.col(0) = ill_conditioned.col(1) + RealScalar(1e-8) * ill_conditioned.col(0);
+  verify_determinant_against_reference(ill_conditioned);
+
+  MatrixType scaled = RealScalar(1e6) * random;
+  verify_determinant_against_reference(scaled);
+
+  MatrixType tiny = MatrixType::Identity(size, size);
+  tiny(0, 0) = Scalar((std::numeric_limits<RealScalar>::min)());
+  verify_determinant_against_reference(tiny);
+
+#if !EIGEN_ARCH_ARM
+  MatrixType subnormal = MatrixType::Identity(size, size);
+  subnormal(0, 0) = Scalar(std::numeric_limits<RealScalar>::denorm_min() * RealScalar(16));
+  verify_determinant_against_reference(subnormal);
+#endif
+}
+
+void determinant_non_finite_lu_fallback() {
+  Matrix<double, 5, 5> m = Matrix<double, 5, 5>::Identity();
+  m(0, 0) = std::numeric_limits<double>::quiet_NaN();
+  VERIFY((numext::isnan)(m.determinant()));
+
+  m = Matrix<double, 5, 5>::Identity();
+  m(0, 0) = std::numeric_limits<double>::infinity();
+  VERIFY((numext::isinf)(m.determinant()));
+}
 
 template <typename MatrixType>
 void determinant(const MatrixType& m) {
@@ -57,9 +147,13 @@
     CALL_SUBTEST_2(determinant(Matrix<double, 2, 2>()));
     CALL_SUBTEST_3(determinant(Matrix<double, 3, 3>()));
     CALL_SUBTEST_4(determinant(Matrix<double, 4, 4>()));
-    CALL_SUBTEST_5(determinant(Matrix<std::complex<double>, 10, 10>()));
+    CALL_SUBTEST_5(determinant(Matrix<double, 5, 5>()));
+    CALL_SUBTEST_6(determinant(Matrix<std::complex<double>, 10, 10>()));
+    CALL_SUBTEST_7(determinant_lu_fallback_reference(Matrix<double, 5, 5>()));
+    CALL_SUBTEST_8(determinant_lu_fallback_reference(MatrixXcd(5, 5)));
+    CALL_SUBTEST_9(determinant_non_finite_lu_fallback());
     s = internal::random<int>(1, EIGEN_TEST_MAX_SIZE / 4);
-    CALL_SUBTEST_6(determinant(MatrixXd(s, s)));
+    CALL_SUBTEST_10(determinant(MatrixXd(s, s)));
     TEST_SET_BUT_UNUSED_VARIABLE(s);
   }
 }