Implement blocked tridiagonalization for large matrices libeigen/eigen!2482 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index f54c444..1dbe009 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
#if !defined(EIGEN_GPU_COMPILE_PHASE)
/** \internal
 * Blocked tridiagonal decomposition (analogous to LAPACK's dsytrd/dlatrd).
 * Processes columns in panels of \a nb columns, accumulating Householder
 * reflectors and deferring the symmetric rank-2k update of the trailing
 * submatrix so it can be applied with Level 3 BLAS (triangular GEMM).
 * Falls back to the unblocked algorithm for the last (partial) panel.
 *
 * \param[in,out] matA    On input the selfadjoint matrix; only the \b lower
 *                        triangular part is referenced. On output the lower
 *                        part holds the tridiagonal entries and the packed
 *                        Householder vectors (same packing as the unblocked
 *                        routine).
 * \param[out] hCoeffs    The Householder coefficients; must have size
 *                        matA.rows() - 1.
 * \param[in] nb          Panel (block) width; must satisfy 2 <= nb < n.
 *
 * NOTE(review): host-only — allocates O(n*nb) workspace on the heap, hence
 * the EIGEN_GPU_COMPILE_PHASE guard and the dynamic-size-only dispatch.
 */
template <typename MatrixType, typename CoeffVectorType>
void tridiagonalization_inplace_blocked(MatrixType& matA, CoeffVectorType& hCoeffs, Index nb = 16) {
  using numext::conj;
  typedef typename MatrixType::Scalar Scalar;
  typedef typename MatrixType::RealScalar RealScalar;
  const Index n = matA.rows();
  eigen_assert(n == matA.cols());
  eigen_assert(n == hCoeffs.size() + 1);
  eigen_assert(nb >= 2 && nb < n);

  enum {
    StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor,
    // Opposite storage order is used below to express W^H (resp. V^H) as the
    // rhs of the triangular GEMM without materializing the adjoint.
    RhsStorageOrder = (StorageOrder == ColMajor) ? RowMajor : ColMajor
  };

  // Workspace: W matrix (n x nb) for deferred update vectors, temp vector (nb) for GEMV, betas (nb).
  typedef Matrix<Scalar, Dynamic, Dynamic, StorageOrder> WorkMatrixType;
  WorkMatrixType W(n, nb);
  Matrix<Scalar, Dynamic, 1> temp(nb);
  Matrix<RealScalar, Dynamic, 1> betas(nb);

  // Pre-allocate GEMM blocking workspace for the largest trailing matrix (first panel).
  // Reused across all panels to avoid repeated heap allocations.
  typedef gemm_blocking_space<StorageOrder, Scalar, Scalar, Dynamic, Dynamic, Dynamic> BlockingType;
  const Index maxTrailingSize = n - nb;
  BlockingType blocking(maxTrailingSize, maxTrailingSize, nb, 1, false);

  Index j0 = 0;
  for (; j0 + nb < n - 1; j0 += nb) {
    const Index j_end = j0 + nb;

    // ---- Panel factorization (dlatrd) ----
    // Process columns j0..j_end-1, computing Householder vectors (stored in matA)
    // and update vectors (stored in W). The rank-2k update to the trailing
    // submatrix is deferred until after the panel.
    for (Index j = j0; j < j_end; ++j) {
      const Index local_j = j - j0;
      const Index remainingSize = n - j - 1;

      // Step 1: Update column j for deferred rank-2 updates from columns j0..j-1.
      //   A(j:n-1, j) -= V * W(j,:)^H + W * V(j,:)^H
      // where V = matA(j:n-1, j0:j-1) holds Householder vectors,
      // and W(j:n-1, 0:local_j-1) holds the corresponding update vectors.
      if (local_j > 0) {
        auto col_j = matA.col(j).segment(j, n - j);
        col_j.noalias() -= matA.block(j, j0, n - j, local_j) * W.row(j).head(local_j).adjoint();
        col_j.noalias() -= W.block(j, 0, n - j, local_j) * matA.row(j).segment(j0, local_j).adjoint();
        // Keep diagonal real (for complex scalars; no-op for real).
        matA.coeffRef(j, j) = numext::real(matA.coeff(j, j));
      }

      // Step 2: Compute Householder reflector for column j.
      RealScalar beta;
      Scalar h;
      matA.col(j).tail(remainingSize).makeHouseholderInPlace(h, beta);
      betas(local_j) = beta;
      // Store an explicit 1 on the subdiagonal so v = [1; essential] can be
      // used directly in the matrix products below; restored to beta after
      // the panel's trailing update.
      matA.col(j).coeffRef(j + 1) = Scalar(1);

      auto v = matA.col(j).tail(remainingSize);
      auto w = W.col(local_j).tail(remainingSize);

      // Step 3: Compute w = conj(h) * A_eff * v where A_eff accounts for deferred updates.
      // Start with SYMV on the stored (not yet updated) trailing submatrix.
      w.noalias() =
          matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() * (conj(h) * v);

      // GEMV corrections for deferred rank-2 updates within this panel.
      if (local_j > 0) {
        auto V_prev = matA.block(j + 1, j0, remainingSize, local_j);
        auto W_prev = W.block(j + 1, 0, remainingSize, local_j);

        // w -= conj(h) * V_prev * (W_prev^H * v)
        temp.head(local_j).noalias() = W_prev.adjoint() * v;
        w.noalias() -= conj(h) * (V_prev * temp.head(local_j));

        // w -= conj(h) * W_prev * (V_prev^H * v)
        temp.head(local_j).noalias() = V_prev.adjoint() * v;
        w.noalias() -= conj(h) * (W_prev * temp.head(local_j));
      }

      // Step 4: Half-dot correction: w -= 0.5 * conj(h) * (w^H * v) * v
      // (Eigen's w.dot(v) conjugates the first operand, i.e. computes w^H v.)
      w += (conj(h) * RealScalar(-0.5) * w.dot(v)) * v;

      hCoeffs.coeffRef(j) = h;
    }

    // ---- Apply rank-2k update to trailing submatrix ----
    //   A(j_end:n-1, j_end:n-1) -= V_trail * W_trail^H + W_trail * V_trail^H
    // using Level 3 BLAS (triangular GEMM). Only the Lower triangle is written.
    const Index trailingSize = n - j_end;
    if (trailingSize > 0) {
      const Scalar* V_data = &matA.coeffRef(j_end, j0);
      const Scalar* W_data = &W.coeffRef(j_end, 0);
      Scalar* C_data = &matA.coeffRef(j_end, j_end);
      const Index V_stride = matA.outerStride();
      const Index W_stride = W.outerStride();
      const Index C_stride = matA.outerStride();

      // C -= V * W^H
      // Template args: <Index, LhsScalar, LhsOrder, ConjLhs, RhsScalar, RhsOrder,
      // ConjRhs, ResOrder, ResInnerStride, UpLo>; the adjoint is expressed via the
      // swapped RhsStorageOrder plus conjugation for complex scalars
      // (presumably matching the internal kernel's convention — confirm against
      // GeneralMatrixMatrixTriangular.h).
      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
                                               Lower>::run(trailingSize, nb, V_data, V_stride, W_data, W_stride, C_data,
                                                           1, C_stride, Scalar(-1), blocking);

      // C -= W * V^H
      general_matrix_matrix_triangular_product<Index, Scalar, StorageOrder, false, Scalar, RhsStorageOrder,
                                               NumTraits<Scalar>::IsComplex, StorageOrder, 1,
                                               Lower>::run(trailingSize, nb, W_data, W_stride, V_data, V_stride, C_data,
                                                           1, C_stride, Scalar(-1), blocking);
    }

    // Restore subdiagonal entries (overwritten with 1 for Householder vectors).
    for (Index j = j0; j < j_end; ++j) {
      matA.coeffRef(j + 1, j) = betas(j - j0);
    }
  }

  // ---- Process remaining columns with unblocked algorithm ----
  if (j0 < n - 1) {
    const Index remaining = n - j0;
    auto trailing = matA.bottomRightCorner(remaining, remaining);
    auto hCoeffs_tail = hCoeffs.segment(j0, remaining - 1);
    tridiagonalization_inplace_unblocked(trailing, hCoeffs_tail);
  }
}
#endif  // !EIGEN_GPU_COMPILE_PHASE
+ */ +template <typename MatrixType, typename CoeffVectorType> +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { + Index n = matA.rows(); + eigen_assert(n == matA.cols()); + eigen_assert(n == hCoeffs.size() + 1 || n == 1); + +#if !defined(EIGEN_GPU_COMPILE_PHASE) + if ((MatrixType::RowsAtCompileTime == Dynamic || MatrixType::ColsAtCompileTime == Dynamic) && n >= 96) { + tridiagonalization_inplace_blocked(matA, hCoeffs); + return; + } +#endif + tridiagonalization_inplace_unblocked(matA, hCoeffs); +} + // forward declaration, implementation at the end of this file template <typename MatrixType, int Size = MatrixType::ColsAtCompileTime, bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index a2339ca..9f50fcb 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp
@@ -938,6 +938,21 @@ // RowMajor CALL_SUBTEST_19(selfadjointeigensolver_rowmajor<0>()); + + // Larger matrices to exercise the blocked tridiagonalization path (n >= 96). + CALL_SUBTEST_4(selfadjointeigensolver(MatrixXd(256, 256))); + CALL_SUBTEST_5(selfadjointeigensolver(MatrixXcd(256, 256))); + CALL_SUBTEST_3(selfadjointeigensolver(MatrixXf(256, 256))); + CALL_SUBTEST_9(selfadjointeigensolver(Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor>(256, 256))); + + // Deterministic blocked-path stress: clustered + extreme spectra at n=128 + // (>= the n>=96 blocking threshold). The repeated/near-repeated and high- + // dynamic-range cases are the ones most likely to expose a stability + // regression in the rank-2k trailing update. + CALL_SUBTEST_4(selfadjointeigensolver_repeated_eigenvalues(MatrixXd(128, 128))); + CALL_SUBTEST_4(selfadjointeigensolver_extreme_eigenvalues(MatrixXd(128, 128))); + CALL_SUBTEST_3(selfadjointeigensolver_repeated_eigenvalues(MatrixXf(128, 128))); + CALL_SUBTEST_5(selfadjointeigensolver_repeated_eigenvalues(MatrixXcd(128, 128))); } CALL_SUBTEST_17(bug_854<0>());
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp index c9956d3..b4a873e 100644 --- a/test/nomalloc.cpp +++ b/test/nomalloc.cpp
@@ -156,6 +156,17 @@ jSVD.compute(A); } +template <typename Scalar> +void selfadjoint_eigensolver_large_fixed_no_malloc() { + typedef Eigen::Matrix<Scalar, 96, 96> Matrix; + Matrix A = Matrix::Random(); + Matrix saA = A.adjoint() * A; + + Eigen::SelfAdjointEigenSolver<Matrix> solver; + solver.compute(saA); + VERIFY_IS_EQUAL(solver.info(), Eigen::Success); +} + void test_zerosized() { // default constructors: Eigen::MatrixXd A; @@ -219,6 +230,8 @@ // Check decomposition modules with dynamic matrices that have a known compile-time max size (ctms) CALL_SUBTEST_4(ctms_decompositions<float>()); + CALL_SUBTEST_4(selfadjoint_eigensolver_large_fixed_no_malloc<float>()); + CALL_SUBTEST_4(selfadjoint_eigensolver_large_fixed_no_malloc<double>()); CALL_SUBTEST_5(test_zerosized());