IncompleteLUT: static row matching to handle zero-diagonal matrices (#2626)

libeigen/eigen!2475

Closes #2626

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
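
A minimal usage sketch of the new behavior (the matrix below is illustrative, not taken from the patch or its tests): the diagonal of `A` is entirely zero, yet its sparsity pattern admits a full structural row matching, so the preconditioner is now built transparently.

```cpp
#include <Eigen/Sparse>  // SparseMatrix, BiCGSTAB, IncompleteLUT
#include <iostream>

int main() {
  // 3x3 matrix with an all-zero diagonal: a cyclic shift of a
  // permutation-like pattern, plus one extra off-diagonal entry.
  // A full bipartite row matching exists, so ILUT can repair the diagonal.
  Eigen::SparseMatrix<double> A(3, 3);
  A.insert(0, 1) = 4.0;
  A.insert(1, 2) = 4.0;
  A.insert(2, 0) = 4.0;
  A.insert(0, 2) = -1.0;
  A.makeCompressed();

  Eigen::BiCGSTAB<Eigen::SparseMatrix<double>, Eigen::IncompleteLUT<double>> solver;
  solver.compute(A);
  // Previously the zero pivots were silently shifted and the preconditioner
  // was useless; with the static row matching this reports Success.
  if (solver.preconditioner().info() != Eigen::Success)
    std::cout << "ILUT hit a zero pivot; preconditioner may be useless\n";

  Eigen::Vector3d b(1.0, 2.0, 3.0);
  Eigen::Vector3d x = solver.solve(b);
  std::cout << "residual: " << (A * x - b).norm() << "\n";
  return 0;
}
```

When the pattern admits no full matching (say, a structurally empty row or column), `factorize()` still shifts the offending pivots as before but now reports `NumericalIssue` instead of silently returning `Success`.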
diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 3a84e55..61e8d3c 100644
--- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -142,7 +142,11 @@
   /** \brief Reports whether previous computation was successful.
    *
    * \returns \c Success if computation was successful,
-   *          \c NumericalIssue if the matrix.appears to be negative.
+   *          \c NumericalIssue if a zero pivot was encountered during
+   *          factorization (the resulting preconditioner is unlikely to be
+   *          usable; the input matrix typically has zero diagonal entries
+   *          that no static row permutation can repair, e.g. because it is
+   *          structurally singular).
    */
   ComputationInfo info() const {
     eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
@@ -157,7 +161,11 @@
 
   /**
    * Compute an incomplete LU factorization with dual threshold on the matrix mat
-   * No pivoting is done in this version
+   * No partial pivoting is done in this version. A static row permutation
+   * (maximum bipartite matching) is computed in \c analyzePattern so that
+   * the permuted matrix has a structurally nonzero diagonal whenever one
+   * exists; without it, ILUT would silently produce a useless
+   * preconditioner on matrices with zero diagonal entries.
    *
    **/
   template <typename MatrixType>
@@ -172,7 +180,7 @@
 
   template <typename Rhs, typename Dest>
   void _solve_impl(const Rhs& b, Dest& x) const {
-    x = m_Pinv * b;
+    x = m_PinvPr * b;
     x = m_lu.template triangularView<UnitLower>().solve(x);
     x = m_lu.template triangularView<Upper>().solve(x);
     x = m_P * x;
@@ -184,6 +192,23 @@
     inline bool operator()(const Index& row, const Index& col, const Scalar&) const { return row != col; }
   };
 
+  template <typename MatrixType>
+  Index computeRowMatching(const MatrixType& amat);
+
+  // Produce a column-major CSC sparsity pattern for `amat` (integers only —
+  // the scalar values are never read or copied). When `amat` is already a
+  // compressed column-major SparseMatrix, `outer`/`inner` point directly into
+  // its index storage; otherwise the pattern is materialized into the local
+  // `outer_buf`/`inner_buf` arrays and pointers are set into them.
+  static void patternColMajor(const SparseMatrix<Scalar, ColMajor, StorageIndex>& amat,
+                              Matrix<StorageIndex, Dynamic, 1>& outer_buf, Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                              const StorageIndex*& outer, const StorageIndex*& inner);
+
+  template <typename MatrixType>
+  static void patternColMajor(const MatrixType& amat, Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                              Matrix<StorageIndex, Dynamic, 1>& inner_buf, const StorageIndex*& outer,
+                              const StorageIndex*& inner);
+
  protected:
   FactorType m_lu;
   RealScalar m_droptol;
@@ -191,8 +216,10 @@
   bool m_analysisIsOk;
   bool m_factorizationIsOk;
   ComputationInfo m_info;
-  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;     // Fill-reducing permutation
-  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;  // Inverse permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;       // Fill-reducing permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;    // Inverse permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pr;      // Static row permutation (matching-based)
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_PinvPr;  // Cached composition m_Pinv * m_Pr for solve
 };
 
 /**
@@ -235,21 +262,221 @@
   return m_lu.template triangularView<Upper>();
 }
 
+// Specialization: amat is already a column-major SparseMatrix.
+// Share its index storage directly when compressed; otherwise materialize the
+// indices (without copying any scalar values) into outer_buf/inner_buf.
+template <typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar, StorageIndex>::patternColMajor(const SparseMatrix<Scalar, ColMajor, StorageIndex>& amat,
+                                                          Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                                                          Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                                                          const StorageIndex*& outer, const StorageIndex*& inner) {
+  if (amat.isCompressed()) {
+    outer = amat.outerIndexPtr();
+    inner = amat.innerIndexPtr();
+    return;
+  }
+  const Index n = amat.cols();
+  const StorageIndex* a_outer = amat.outerIndexPtr();
+  const StorageIndex* a_inner_nz = amat.innerNonZeroPtr();
+  const StorageIndex* a_inner = amat.innerIndexPtr();
+  outer_buf.resize(n + 1);
+  outer_buf(0) = 0;
+  for (Index j = 0; j < n; ++j) outer_buf(j + 1) = outer_buf(j) + a_inner_nz[j];
+  inner_buf.resize(outer_buf(n));
+  for (Index j = 0; j < n; ++j) {
+    const StorageIndex* src = a_inner + a_outer[j];
+    std::copy(src, src + a_inner_nz[j], inner_buf.data() + outer_buf(j));
+  }
+  outer = outer_buf.data();
+  inner = inner_buf.data();
+}
+
+// Generic fallback: any other sparse input (row-major, expressions). Build a
+// column-major pattern via inner iterators — no scalar values are read.
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+void IncompleteLUT<Scalar, StorageIndex>::patternColMajor(const MatrixType_& amat,
+                                                          Matrix<StorageIndex, Dynamic, 1>& outer_buf,
+                                                          Matrix<StorageIndex, Dynamic, 1>& inner_buf,
+                                                          const StorageIndex*& outer, const StorageIndex*& inner) {
+  using internal::convert_index;
+  const Index n = amat.cols();
+  outer_buf.setZero(n + 1);
+  for (Index i = 0; i < amat.outerSize(); ++i)
+    for (typename MatrixType_::InnerIterator it(amat, i); it; ++it) ++outer_buf(it.col() + 1);
+  for (Index j = 0; j < n; ++j) outer_buf(j + 1) += outer_buf(j);
+  inner_buf.resize(outer_buf(n));
+  Matrix<StorageIndex, Dynamic, 1> head = outer_buf.head(n);
+  for (Index i = 0; i < amat.outerSize(); ++i)
+    for (typename MatrixType_::InnerIterator it(amat, i); it; ++it)
+      inner_buf(head(it.col())++) = convert_index<StorageIndex>(it.row());
+  outer = outer_buf.data();
+  inner = inner_buf.data();
+}
+
+// Compute a row permutation m_Pr such that (m_Pr * amat) has a structurally
+// nonzero diagonal wherever one exists. Returns the number of matched columns.
+// Uses a maximum bipartite cardinality matching on the sparsity pattern, with
+// a greedy initialization that prefers the natural diagonal so that matrices
+// already having a nonzero diagonal yield the identity permutation.
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+Index IncompleteLUT<Scalar, StorageIndex>::computeRowMatching(const MatrixType_& amat) {
+  using internal::convert_index;
+  const Index n = amat.rows();
+  // We only need the column-major sparsity pattern; never read scalar values.
+  // Share amat's index storage when it is already a compressed column-major
+  // SparseMatrix, otherwise build a value-free pattern into local arrays.
+  Matrix<StorageIndex, Dynamic, 1> outer_buf;
+  Matrix<StorageIndex, Dynamic, 1> inner_buf;
+  const StorageIndex* outer = nullptr;
+  const StorageIndex* inner = nullptr;
+  patternColMajor(amat, outer_buf, inner_buf, outer, inner);
+
+  const StorageIndex kUnmatched = StorageIndex(-1);
+  // match_row[j] = original row matched to column j; match_col[i] = column matched to row i.
+  std::vector<StorageIndex> match_row(n, kUnmatched);
+  std::vector<StorageIndex> match_col(n, kUnmatched);
+
+  // The matching uses the stored sparsity pattern only and is independent of
+  // numerical values. This preserves the analyzePattern/factorize contract:
+  // the same analysis is reusable for any matrix sharing this stored pattern.
+  // Phase 1: greedy diagonal preference.
+  for (Index j = 0; j < n; ++j) {
+    for (Index k = outer[j]; k < outer[j + 1]; ++k) {
+      if (Index(inner[k]) == j) {
+        match_row[j] = convert_index<StorageIndex>(j);
+        match_col[j] = convert_index<StorageIndex>(j);
+        break;
+      }
+    }
+  }
+  // Phase 2: greedy off-diagonal pickup of any free row.
+  for (Index j = 0; j < n; ++j) {
+    if (match_row[j] != kUnmatched) continue;
+    for (Index k = outer[j]; k < outer[j + 1]; ++k) {
+      Index i = inner[k];
+      if (match_col[i] == kUnmatched) {
+        match_row[j] = convert_index<StorageIndex>(i);
+        match_col[i] = convert_index<StorageIndex>(j);
+        break;
+      }
+    }
+  }
+  // Phase 3: augmenting paths for any column still unmatched.
+  std::vector<StorageIndex> visited(n, kUnmatched);
+  // Iterative DFS: the stack frames are (column, edge index, chosen row).
+  // chosen_row[k] is the row that frame k will commit to if a path is found.
+  std::vector<Index> stack_col;
+  std::vector<Index> stack_pos;
+  std::vector<Index> stack_chosen_row;
+  stack_col.reserve(n);
+  stack_pos.reserve(n);
+  stack_chosen_row.reserve(n);
+
+  for (Index start = 0; start < n; ++start) {
+    if (match_row[start] != kUnmatched) continue;
+    StorageIndex epoch = convert_index<StorageIndex>(start);
+    stack_col.clear();
+    stack_pos.clear();
+    stack_chosen_row.clear();
+    stack_col.push_back(start);
+    stack_pos.push_back(outer[start]);
+    stack_chosen_row.push_back(-1);
+
+    while (!stack_col.empty()) {
+      Index j = stack_col.back();
+      Index pos = stack_pos.back();
+      Index col_end = outer[j + 1];
+      bool advanced = false;
+
+      while (pos < col_end) {
+        Index i = inner[pos];
+        ++pos;
+        if (visited[i] == epoch) continue;
+        visited[i] = epoch;
+
+        if (match_col[i] == kUnmatched) {
+          // Found an augmenting path: commit it.
+          stack_chosen_row.back() = i;
+          stack_pos.back() = pos;
+          for (size_t k = 0; k < stack_col.size(); ++k) {
+            Index col = stack_col[k];
+            Index row = stack_chosen_row[k];
+            match_row[col] = convert_index<StorageIndex>(row);
+            match_col[row] = convert_index<StorageIndex>(col);
+          }
+          stack_col.clear();
+          break;
+        } else {
+          // Descend into the column currently matched to row i.
+          stack_chosen_row.back() = i;
+          stack_pos.back() = pos;
+          Index next_col = match_col[i];
+          stack_col.push_back(next_col);
+          stack_pos.push_back(outer[next_col]);
+          stack_chosen_row.push_back(-1);
+          advanced = true;
+          break;
+        }
+      }
+
+      if (!advanced && !stack_col.empty()) {
+        stack_col.pop_back();
+        stack_pos.pop_back();
+        stack_chosen_row.pop_back();
+      }
+    }
+  }
+
+  // Build the row permutation. Matched columns get their matching row;
+  // leftover rows are paired with the leftover columns in ascending order.
+  m_Pr.resize(n);
+  std::vector<bool> col_used(n, false), row_used(n, false);
+  Index matched = 0;
+  for (Index j = 0; j < n; ++j) {
+    if (match_row[j] != kUnmatched) {
+      m_Pr.indices()(match_row[j]) = convert_index<StorageIndex>(j);
+      col_used[j] = true;
+      row_used[match_row[j]] = true;
+      ++matched;
+    }
+  }
+  Index next_col = 0;
+  for (Index i = 0; i < n; ++i) {
+    if (row_used[i]) continue;
+    while (next_col < n && col_used[next_col]) ++next_col;
+    m_Pr.indices()(i) = convert_index<StorageIndex>(next_col);
+    ++next_col;
+  }
+  return matched;
+}
+
 template <typename Scalar, typename StorageIndex>
 template <typename MatrixType_>
 void IncompleteLUT<Scalar, StorageIndex>::analyzePattern(const MatrixType_& amat) {
-  // Compute the Fill-reducing permutation
+  eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
+  // 1. Compute a static row permutation that makes the diagonal structurally
+  //    nonzero wherever possible; this is a workaround for the lack of
+  //    partial pivoting in ILUT. For matrices that already have a nonzero
+  //    diagonal it yields the identity permutation and is essentially free.
+  computeRowMatching(amat);
+
+  // 2. Compute the Fill-reducing permutation on the row-permuted matrix.
   // Since ILUT does not perform any numerical pivoting,
   // it is highly preferable to keep the diagonal through symmetric permutations.
   // To this end, let's symmetrize the pattern and perform AMD on it.
-  SparseMatrix<Scalar, ColMajor, StorageIndex> mat1 = amat;
-  SparseMatrix<Scalar, ColMajor, StorageIndex> mat2 = amat.transpose();
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat1 = m_Pr * amat;
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat2 = mat1.transpose();
   // FIXME: for a nearly symmetric pattern, mat2+mat1 is appropriate;
   //        for a highly non-symmetric pattern, mat2*mat1 should be preferred.
   SparseMatrix<Scalar, ColMajor, StorageIndex> AtA = mat2 + mat1;
   AMDOrdering<StorageIndex> ordering;
   ordering(AtA, m_P);
   m_Pinv = m_P.inverse();  // cache the inverse permutation
+  // Cache the composition m_Pinv * m_Pr so _solve_impl applies a single
+  // permutation to the RHS instead of two.
+  m_PinvPr = m_Pinv * m_Pr;
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
   m_isInitialized = true;
@@ -271,10 +498,13 @@
   VectorI ju(n);  // column position of the values in u -- maximum size  is n
   VectorI jr(n);  // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
-  // Apply the fill-reducing permutation
+  // Apply the static row permutation (from analyzePattern), then the
+  // fill-reducing symmetric permutation.
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+  SparseMatrix<Scalar, RowMajor, StorageIndex> row_permuted_mat = m_Pr * amat;
   SparseMatrix<Scalar, RowMajor, StorageIndex> mat;
-  mat = amat.twistedBy(m_Pinv);
+  mat = row_permuted_mat.twistedBy(m_Pinv);
+  Index zero_pivots = 0;
 
   // Initialization
   jr.fill(-1);
@@ -415,7 +645,10 @@
 
     // store the diagonal element
     // apply a shifting rule to avoid zero pivots (we are doing an incomplete factorization)
-    if (u(ii) == Scalar(0)) u(ii) = sqrt(m_droptol) * rownorm;
+    if (u(ii) == Scalar(0)) {
+      u(ii) = sqrt(m_droptol) * rownorm;
+      ++zero_pivots;
+    }
     m_lu.insertBackByOuterInnerUnordered(ii, ii) = u(ii);
 
     // sort the U-part of the row
@@ -441,7 +674,11 @@
   m_lu.makeCompressed();
 
   m_factorizationIsOk = true;
-  m_info = Success;
+  // If we had to shift any zero pivot, the factorization is not faithful to
+  // the input matrix and the resulting preconditioner may be useless.
+  // Report this to the caller via NumericalIssue rather than silently
+  // returning Success.
+  m_info = (zero_pivots == 0) ? Success : NumericalIssue;
 }
 
 }  // end namespace Eigen
diff --git a/test/incomplete_LUT.cpp b/test/incomplete_LUT.cpp
index 7213ecc..b9c1679 100644
--- a/test/incomplete_LUT.cpp
+++ b/test/incomplete_LUT.cpp
@@ -78,6 +78,121 @@
   VERIFY_IS_APPROX(expectedMatU, matU);
 }
 
+// https://gitlab.com/libeigen/eigen/-/issues/2626
+// IncompleteLUT must apply a static row permutation so that matrices whose
+// natural diagonal has zeros (but admit a full bipartite matching on the
+// sparsity pattern) can still be factorized. Without it, ILUT silently
+// produces a useless preconditioner.
+template <typename T>
+void test_zero_diagonal_2626() {
+  typedef Eigen::SparseMatrix<T> SparseMatrixT;
+  typedef Eigen::Matrix<T, Eigen::Dynamic, 1> Vector;
+
+  const int n = 50;
+  // 1D Laplacian: SPD, fully nonzero diagonal.
+  std::vector<Eigen::Triplet<T>> triplets;
+  for (int i = 0; i < n; ++i) {
+    triplets.emplace_back(i, i, T(2));
+    if (i > 0) triplets.emplace_back(i, i - 1, T(-1));
+    if (i + 1 < n) triplets.emplace_back(i, i + 1, T(-1));
+  }
+  SparseMatrixT base(n, n);
+  base.setFromTriplets(triplets.begin(), triplets.end());
+
+  // Cyclic row shift by 2: it moves the tridiagonal band off the natural
+  // diagonal, so every diagonal entry of (Pshift * base) is zero. A full
+  // row matching exists (e.g. the inverse shift), so ILUT must still work.
+  Eigen::PermutationMatrix<Eigen::Dynamic> Pshift(n);
+  for (int i = 0; i < n; ++i) Pshift.indices()(i) = (i + 2) % n;
+  SparseMatrixT A = Pshift * base;
+
+  // Verify the test setup: every diagonal entry is zero.
+  int nz_diag = 0;
+  for (int i = 0; i < n; ++i)
+    if (A.coeff(i, i) != T(0)) ++nz_diag;
+  VERIFY(nz_diag == 0);
+
+  // BiCGSTAB + IncompleteLUT must converge to a reasonable residual.
+  Vector b = Vector::Random(n);
+  Eigen::BiCGSTAB<SparseMatrixT, Eigen::IncompleteLUT<T>> solver;
+  solver.setTolerance(typename Eigen::NumTraits<T>::Real(16) * Eigen::NumTraits<T>::epsilon());
+  solver.compute(A);
+  VERIFY(solver.preconditioner().info() == Eigen::Success);
+  Vector x = solver.solve(b);
+  VERIFY(solver.info() == Eigen::Success);
+  Vector residual = b - A * x;
+  // Solver was set to tol = 16*eps; allow some slack for the residual check.
+  typename Eigen::NumTraits<T>::Real residual_bound =
+      typename Eigen::NumTraits<T>::Real(1024) * Eigen::NumTraits<T>::epsilon() * b.norm();
+  VERIFY(residual.norm() < residual_bound);
+}
+
+// A structurally singular matrix (empty row) cannot be made diagonal-nonzero
+// by any row permutation; the factorization must report NumericalIssue.
+// This covers the rownorm == 0 early-return path.
+template <typename T>
+void test_structurally_singular() {
+  typedef Eigen::SparseMatrix<T> SparseMatrixT;
+  std::vector<Eigen::Triplet<T>> triplets;
+  triplets.emplace_back(0, 0, T(2));
+  triplets.emplace_back(0, 1, T(1));
+  triplets.emplace_back(1, 0, T(1));
+  triplets.emplace_back(1, 1, T(2));
+  // row 2 is intentionally empty
+  triplets.emplace_back(3, 0, T(1));
+  triplets.emplace_back(3, 3, T(2));
+  SparseMatrixT A(4, 4);
+  A.setFromTriplets(triplets.begin(), triplets.end());
+
+  Eigen::IncompleteLUT<T> ilut;
+  ilut.compute(A);
+  VERIFY(ilut.info() == Eigen::NumericalIssue);
+}
+
+// A matrix where every row is structurally non-empty (so rownorm != 0) but
+// no full bipartite matching exists, forcing at least one pivot to be shifted.
+// Rows 0 and 1 both have their only entry in column 0 and column 1 is
+// structurally empty, so no full matching exists. This exercises the
+// shifted-pivot path that flips info() to NumericalIssue.
+template <typename T>
+void test_zero_pivot_numerical_issue() {
+  typedef Eigen::SparseMatrix<T> SparseMatrixT;
+  SparseMatrixT A(3, 3);
+  std::vector<Eigen::Triplet<T>> triplets;
+  triplets.emplace_back(0, 0, T(1));
+  triplets.emplace_back(1, 0, T(2));  // row 1 competes with row 0 for column 0
+  triplets.emplace_back(2, 2, T(3));
+  A.setFromTriplets(triplets.begin(), triplets.end());
+
+  Eigen::IncompleteLUT<T> ilut;
+  ilut.compute(A);
+  VERIFY(ilut.info() == Eigen::NumericalIssue);
+}
+
+// analyzePattern() must depend only on the stored sparsity pattern and not on
+// numerical values. Otherwise, the two-step API contract breaks: the same
+// analysis would no longer be reusable for any matrix sharing this stored
+// pattern. Repro: analyze a pattern with all-zero placeholder values, then
+// factorize a numerically nonzero matrix sharing that pattern.
+template <typename T>
+void test_pattern_value_separation() {
+  typedef Eigen::SparseMatrix<T> SparseMatrixT;
+  SparseMatrixT pattern(2, 2);
+  pattern.insert(0, 1) = T(0);
+  pattern.insert(1, 0) = T(0);
+  pattern.makeCompressed();
+
+  SparseMatrixT A(2, 2);
+  A.insert(0, 1) = T(1);
+  A.insert(1, 0) = T(1);
+  A.makeCompressed();
+
+  Eigen::IncompleteLUT<T> ilut;
+  ilut.analyzePattern(pattern);
+  ilut.factorize(A);
+  VERIFY(ilut.info() == Eigen::Success);
+}
+
 EIGEN_DECLARE_TEST(incomplete_LUT) {
   CALL_SUBTEST_1((test_incompleteLUT_T<double, int>()));
   CALL_SUBTEST_1((test_incompleteLUT_T<float, int>()));
@@ -86,4 +201,10 @@
 
   CALL_SUBTEST_4(test_extract_LU<double>());
   CALL_SUBTEST_4(test_extract_LU<float>());
+
+  CALL_SUBTEST_5(test_zero_diagonal_2626<double>());
+  CALL_SUBTEST_5(test_zero_diagonal_2626<float>());
+  CALL_SUBTEST_5(test_structurally_singular<double>());
+  CALL_SUBTEST_5(test_zero_pivot_numerical_issue<double>());
+  CALL_SUBTEST_5(test_pattern_value_separation<double>());
 }