Implement blocked tridiagonalization for large matrices

libeigen/eigen!2482

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index f54c444..1dbe009 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -310,7 +310,8 @@
 namespace internal {
 
 /** \internal
- * Performs a tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+ * Unblocked tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+ * Processes one column at a time using Level 2 BLAS operations (SYMV, SYR2).
  *
  * \param[in,out] matA On input the selfadjoint matrix. Only the \b lower triangular part is referenced.
  *                     On output, the strict upper part is left unchanged, and the lower triangular part
@@ -333,7 +334,7 @@
  * \sa Tridiagonalization::packedMatrix()
  */
 template <typename MatrixType, typename CoeffVectorType>
-EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) {
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace_unblocked(MatrixType& matA, CoeffVectorType& hCoeffs) {
   using numext::conj;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -368,6 +369,159 @@
   }
 }
 
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+/** \internal
+ * Blocked tridiagonal decomposition (analogous to LAPACK's dsytrd/dlatrd).
+ * Processes columns in panels of nb columns, accumulating Householder reflectors
+ * and deferring the symmetric rank-2k update to Level 3 BLAS (a SYR2K-like GEMM
+ * on the lower triangle). The final partial panel uses the unblocked algorithm.
+ */
+template <typename MatrixType, typename CoeffVectorType>
+void tridiagonalization_inplace_blocked(MatrixType& matA, CoeffVectorType& hCoeffs, Index nb = 16) {
+  using numext::conj;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  const Index n = matA.rows();
+  eigen_assert(n == matA.cols());
+  eigen_assert(n == hCoeffs.size() + 1);
+  eigen_assert(nb >= 2 && nb < n);
+
+  enum {
+    StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+    RhsStorageOrder = (StorageOrder == ColMajor) ? RowMajor : ColMajor
+  };
+
+  // Workspace: W matrix (n x nb) for deferred update vectors, temp vector (nb) for GEMV, betas (nb).
+  typedef Matrix<Scalar, Dynamic, Dynamic, StorageOrder> WorkMatrixType;
+  WorkMatrixType W(n, nb);
+  Matrix<Scalar, Dynamic, 1> temp(nb);
+  Matrix<RealScalar, Dynamic, 1> betas(nb);
+
+  // Pre-allocate GEMM blocking workspace for the largest trailing matrix (first panel).
+  // Reused across all panels to avoid repeated heap allocations.
+  typedef gemm_blocking_space<StorageOrder, Scalar, Scalar, Dynamic, Dynamic, Dynamic> BlockingType;
+  const Index maxTrailingSize = n - nb;
+  BlockingType blocking(maxTrailingSize, maxTrailingSize, nb, 1, false);
+
+  Index j0 = 0;
+  for (; j0 + nb < n - 1; j0 += nb) {
+    const Index j_end = j0 + nb;
+
+    // ---- Panel factorization (dlatrd) ----
+    // Process columns j0..j_end-1, computing Householder vectors (stored in matA)
+    // and update vectors (stored in W). The rank-2k update to the trailing
+    // submatrix is deferred until after the panel.
+    for (Index j = j0; j < j_end; ++j) {
+      const Index local_j = j - j0;
+      const Index remainingSize = n - j - 1;
+
+      // Step 1: Update column j for deferred rank-2 updates from columns j0..j-1.
+      // A(j:n-1, j) -= V * W(j,:)^H + W * V(j,:)^H
+      // where V = matA(j:n-1, j0:j-1) holds Householder vectors,
+      // and W(j:n-1, 0:lj-1) holds the corresponding update vectors.
+      if (local_j > 0) {
+        auto col_j = matA.col(j).segment(j, n - j);
+        col_j.noalias() -= matA.block(j, j0, n - j, local_j) * W.row(j).head(local_j).adjoint();
+        col_j.noalias() -= W.block(j, 0, n - j, local_j) * matA.row(j).segment(j0, local_j).adjoint();
+        // Keep diagonal real (for complex scalars; no-op for real).
+        matA.coeffRef(j, j) = numext::real(matA.coeff(j, j));
+      }
+
+      // Step 2: Compute Householder reflector for column j.
+      RealScalar beta;
+      Scalar h;
+      matA.col(j).tail(remainingSize).makeHouseholderInPlace(h, beta);
+      betas(local_j) = beta;
+      matA.col(j).coeffRef(j + 1) = Scalar(1);
+
+      auto v = matA.col(j).tail(remainingSize);
+      auto w = W.col(local_j).tail(remainingSize);
+
+      // Step 3: Compute w = conj(h) * A_eff * v where A_eff accounts for deferred updates.
+      // Start with SYMV on the stored (not yet updated) trailing submatrix.
+      w.noalias() =
+          matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() * (conj(h) * v);
+
+      // GEMV corrections for deferred rank-2 updates within this panel.
+      if (local_j > 0) {
+        auto V_prev = matA.block(j + 1, j0, remainingSize, local_j);
+        auto W_prev = W.block(j + 1, 0, remainingSize, local_j);
+
+        // w -= conj(h) * V_prev * (W_prev^H * v)
+        temp.head(local_j).noalias() = W_prev.adjoint() * v;
+        w.noalias() -= conj(h) * (V_prev * temp.head(local_j));
+
+        // w -= conj(h) * W_prev * (V_prev^H * v)
+        temp.head(local_j).noalias() = V_prev.adjoint() * v;
+        w.noalias() -= conj(h) * (W_prev * temp.head(local_j));
+      }
+
+      // Step 4: Half-dot correction (as in LAPACK xLATRD): w -= 0.5 * conj(h) * (w^H * v) * v
+      w += (conj(h) * RealScalar(-0.5) * w.dot(v)) * v;
+
+      hCoeffs.coeffRef(j) = h;
+    }
+
+    // ---- Apply rank-2k update to trailing submatrix ----
+    // A(j_end:n-1, j_end:n-1) -= V_trail * W_trail^H + W_trail * V_trail^H
+    // using Level 3 BLAS (triangular GEMM).
+    const Index trailingSize = n - j_end;
+    if (trailingSize > 0) {
+      const Scalar* V_data = &matA.coeffRef(j_end, j0);
+      const Scalar* W_data = &W.coeffRef(j_end, 0);
+      Scalar* C_data = &matA.coeffRef(j_end, j_end);
+      const Index V_stride = matA.outerStride();
+      const Index W_stride = W.outerStride();
+      const Index C_stride = matA.outerStride();
+
+      // C -= V * W^H
+      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
+                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
+                                               Lower>::run(trailingSize, nb, V_data, V_stride, W_data, W_stride, C_data,
+                                                           1, C_stride, Scalar(-1), blocking);
+
+      // C -= W * V^H
+      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
+                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
+                                               Lower>::run(trailingSize, nb, W_data, W_stride, V_data, V_stride, C_data,
+                                                           1, C_stride, Scalar(-1), blocking);
+    }
+
+    // Restore subdiagonal entries (temporarily set to 1 so the stored Householder vectors have a unit leading entry).
+    for (Index j = j0; j < j_end; ++j) {
+      matA.coeffRef(j + 1, j) = betas(j - j0);
+    }
+  }
+
+  // ---- Process remaining columns with unblocked algorithm ----
+  if (j0 < n - 1) {
+    const Index remaining = n - j0;
+    auto trailing = matA.bottomRightCorner(remaining, remaining);
+    auto hCoeffs_tail = hCoeffs.segment(j0, remaining - 1);
+    tridiagonalization_inplace_unblocked(trailing, hCoeffs_tail);
+  }
+}
+#endif  // !EIGEN_GPU_COMPILE_PHASE
+
+/** \internal
+ * Dispatches to the blocked algorithm for dynamic-size matrices with n >= 96;
+ * fixed-size matrices and GPU builds always take the unblocked, allocation-free path.
+ */
+template <typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) {
+  Index n = matA.rows();
+  eigen_assert(n == matA.cols());
+  eigen_assert(n == hCoeffs.size() + 1 || n == 1);
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+  if ((MatrixType::RowsAtCompileTime == Dynamic || MatrixType::ColsAtCompileTime == Dynamic) && n >= 96) {
+    tridiagonalization_inplace_blocked(matA, hCoeffs);
+    return;
+  }
+#endif
+  tridiagonalization_inplace_unblocked(matA, hCoeffs);
+}
+
 // forward declaration, implementation at the end of this file
 template <typename MatrixType, int Size = MatrixType::ColsAtCompileTime,
           bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index a2339ca..9f50fcb 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -938,6 +938,21 @@
 
     // RowMajor
     CALL_SUBTEST_19(selfadjointeigensolver_rowmajor<0>());
+
+    // Larger matrices to exercise the blocked tridiagonalization path (n >= 96).
+    CALL_SUBTEST_4(selfadjointeigensolver(MatrixXd(256, 256)));
+    CALL_SUBTEST_5(selfadjointeigensolver(MatrixXcd(256, 256)));
+    CALL_SUBTEST_3(selfadjointeigensolver(MatrixXf(256, 256)));
+    CALL_SUBTEST_9(selfadjointeigensolver(Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor>(256, 256)));
+
+    // Deterministic blocked-path stress: clustered + extreme spectra at n=128
+    // (>= the n>=96 blocking threshold). The repeated/near-repeated and high-
+    // dynamic-range cases are the ones most likely to expose a stability
+    // regression in the rank-2k trailing update.
+    CALL_SUBTEST_4(selfadjointeigensolver_repeated_eigenvalues(MatrixXd(128, 128)));
+    CALL_SUBTEST_4(selfadjointeigensolver_extreme_eigenvalues(MatrixXd(128, 128)));
+    CALL_SUBTEST_3(selfadjointeigensolver_repeated_eigenvalues(MatrixXf(128, 128)));
+    CALL_SUBTEST_5(selfadjointeigensolver_repeated_eigenvalues(MatrixXcd(128, 128)));
   }
 
   CALL_SUBTEST_17(bug_854<0>());
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index c9956d3..b4a873e 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -156,6 +156,17 @@
   jSVD.compute(A);
 }
 
+template <typename Scalar>
+void selfadjoint_eigensolver_large_fixed_no_malloc() {
+  typedef Eigen::Matrix<Scalar, 96, 96> Matrix;
+  Matrix A = Matrix::Random();
+  Matrix saA = A.adjoint() * A;
+
+  Eigen::SelfAdjointEigenSolver<Matrix> solver;
+  solver.compute(saA);
+  VERIFY_IS_EQUAL(solver.info(), Eigen::Success);
+}
+
 void test_zerosized() {
   // default constructors:
   Eigen::MatrixXd A;
@@ -219,6 +230,8 @@
 
   // Check decomposition modules with dynamic matrices that have a known compile-time max size (ctms)
   CALL_SUBTEST_4(ctms_decompositions<float>());
+  CALL_SUBTEST_4(selfadjoint_eigensolver_large_fixed_no_malloc<float>());
+  CALL_SUBTEST_4(selfadjoint_eigensolver_large_fixed_no_malloc<double>());
 
   CALL_SUBTEST_5(test_zerosized());