| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| // SPDX-License-Identifier: MPL-2.0 |
| |
| #ifndef EIGEN_TRIDIAGONAL_BISECTION_H |
| #define EIGEN_TRIDIAGONAL_BISECTION_H |
| |
| // IWYU pragma: private |
| #include "./InternalHeaderCheck.h" |
| |
| namespace Eigen { |
| |
| /** \eigenvalues_module \ingroup Eigenvalues_Module |
| * |
| * \brief Selects which eigenvalues to compute in a spectral-bisection solve. |
| * |
| * Passed to TridiagonalEigenSolver::computeEigenvalues(). The Sturm-sequence |
| * bisection algorithm can compute an arbitrary contiguous subset of the |
| * spectrum, which the implicit-QR algorithm cannot. Three modes are provided: |
| * - all(): every eigenvalue (the default), |
| * - indices(il, iu): the eigenvalues with 0-based ascending indices in the |
| * half-open range [il, iu) (i.e. the (iu - il) smallest starting at il), |
| * - values(vl, vu): the eigenvalues lying in the half-open interval [vl, vu) |
| * (lower-closed, upper-open, like indices(); an eigenvalue exactly equal to |
| * vl is selected, one equal to vu is not). |
| * |
| * In every mode the computed eigenvalues are returned in non-decreasing order. |
| */ |
| struct EigenvalueRange { |
| enum Kind { All, ByIndex, ByValue }; |
| |
| Kind kind; |
| Index il; |
| Index iu; |
| // Endpoints are held as long double so values() does not silently collapse two nearby endpoints |
| // to the same number when the solver's RealScalar is long double (they are narrowed on use). |
| long double vl; |
| long double vu; |
| |
| /** \returns a selector for the entire spectrum. */ |
| static EigenvalueRange all() { return EigenvalueRange{All, 0, 0, 0.0L, 0.0L}; } |
| |
| /** \returns a selector for the eigenvalues with 0-based indices in [il, iu). */ |
| static EigenvalueRange indices(Index il, Index iu) { return EigenvalueRange{ByIndex, il, iu, 0.0L, 0.0L}; } |
| |
| /** \returns a selector for the eigenvalues in the half-open interval [vl, vu). */ |
| static EigenvalueRange values(long double vl, long double vu) { return EigenvalueRange{ByValue, 0, 0, vl, vu}; } |
| }; |
| |
| namespace internal { |
| |
| /** \internal |
| * |
| * Evaluates Sturm sequences for a real symmetric tridiagonal matrix T at a |
| * batch of shift points, using packet math to process many points in parallel. |
| * |
| * For a shift \c x the routine computes the number of eigenvalues of T that are |
| * less than \c x via the recurrence |
| * \f$ q_0 = \alpha_0 - x,\quad q_i = (\alpha_i - \beta_{i-1}^2 / q_{i-1}) - x \f$, |
| * counting how many pivots \f$ q_i \f$ are non-positive. To avoid division by |
| * zero and overflow, every pivot that drops to or below \c pivmin is treated as |
| * a (small) negative pivot, counted, and clamped to \c -pivmin. The same guard |
| * is applied to every pivot including the first, so the scalar tail produces |
| * results bit-identical to the vectorized body. |
| * |
| * \param[in] alpha diagonal of T, length \c n. |
| * \param[in] beta_sq squared off-diagonal of T, \f$ \beta_i^2 \f$, length \c n-1. |
| * \param[in] n dimension of T. |
| * \param[in] pivmin smallest allowed pivot magnitude, strictly positive, |
| * e.g. \c safemin * max_i beta_i^2 (floored to \c safemin). |
| * \param[in] eval_points the \c num_points shift values at which to evaluate. |
| * \param[out] count on output \c count[j] is the number of eigenvalues of T |
| * less than \c eval_points[j]. Must have room for |
| * \c num_points entries. |
| */ |
| // Evaluates the Sturm count for \c kUnroll packets of shift points at once, fully unrolled so |
| // the per-lane state (x, q, running count) stays in vector registers. See tridiagonal_sturm_counts(). |
| template <int kUnroll, typename RealScalar> |
| EIGEN_STRONG_INLINE void tridiagonal_sturm_block(const RealScalar* alpha, const RealScalar* beta_sq, Index n, |
| RealScalar pivmin, const RealScalar* eval_points, RealScalar* count, |
| Index start) { |
| typedef typename packet_traits<RealScalar>::type Packet; |
| constexpr int kPacketSize = unpacket_traits<Packet>::size; |
| const Packet pivmin_p = pset1<Packet>(pivmin); |
| const Packet neg_pivmin_p = pset1<Packet>(-pivmin); |
| const Packet one_p = pset1<Packet>(RealScalar(1)); |
| |
| Packet x[kUnroll], q[kUnroll], c[kUnroll]; |
| const Packet alpha_0 = pset1<Packet>(alpha[0]); |
| EIGEN_UNROLL_LOOP |
| for (int k = 0; k < kUnroll; ++k) { |
| x[k] = ploadu<Packet>(eval_points + start + k * kPacketSize); |
| q[k] = psub(alpha_0, x[k]); |
| const Packet mask = pcmp_le(q[k], pivmin_p); |
| c[k] = pand(one_p, mask); |
| q[k] = pselect(mask, pmin(q[k], neg_pivmin_p), q[k]); |
| } |
| for (Index i = 1; i < n; ++i) { |
| const Packet alpha_i = pset1<Packet>(alpha[i]); |
| const Packet beta_sq_im1 = pset1<Packet>(beta_sq[i - 1]); |
| EIGEN_UNROLL_LOOP |
| for (int k = 0; k < kUnroll; ++k) { |
| // q = (alpha_i - beta_{i-1}^2 / q) - x. |
| q[k] = psub(psub(alpha_i, pdiv(beta_sq_im1, q[k])), x[k]); |
| const Packet mask = pcmp_le(q[k], pivmin_p); |
| c[k] = padd(c[k], pand(one_p, mask)); |
| q[k] = pselect(mask, pmin(q[k], neg_pivmin_p), q[k]); |
| } |
| } |
| EIGEN_UNROLL_LOOP |
| for (int k = 0; k < kUnroll; ++k) pstoreu(count + start + k * kPacketSize, c[k]); |
| } |
| |
| template <typename RealScalar> |
| void tridiagonal_sturm_counts(const RealScalar* alpha, const RealScalar* beta_sq, Index n, RealScalar pivmin, |
| const RealScalar* eval_points, RealScalar* count, Index num_points) { |
| typedef typename packet_traits<RealScalar>::type Packet; |
| constexpr Index kPacketSize = Index(unpacket_traits<Packet>::size); |
| // num_points is a point count and so never negative; clearing the sign bit makes that explicit to |
| // the compiler, which lets the division by the power-of-two kPacketSize lower to a shift (and tidies |
| // the trailing cascade loops) instead of emitting signed-division sign-correction code. |
| num_points &= (std::numeric_limits<Index>::max)(); |
| const Index full = num_points / kPacketSize; // number of whole packets |
| |
| // Process whole packets in register-resident blocks, cascading 8 -> 4 -> 2 -> 1 so that even a |
| // small batch gets enough independent recurrences to hide the pdiv latency. Eight packets is the |
| // measured sweet spot on AVX2 (more would spill the per-lane state out of vector registers). |
| Index p = 0; |
| for (; p + 8 <= full; p += 8) |
| tridiagonal_sturm_block<8>(alpha, beta_sq, n, pivmin, eval_points, count, p * kPacketSize); |
| for (; p + 4 <= full; p += 4) |
| tridiagonal_sturm_block<4>(alpha, beta_sq, n, pivmin, eval_points, count, p * kPacketSize); |
| for (; p + 2 <= full; p += 2) |
| tridiagonal_sturm_block<2>(alpha, beta_sq, n, pivmin, eval_points, count, p * kPacketSize); |
| for (; p + 1 <= full; p += 1) |
| tridiagonal_sturm_block<1>(alpha, beta_sq, n, pivmin, eval_points, count, p * kPacketSize); |
| |
| // Scalar tail for the remaining (< kPacketSize) points. Bit-identical to the packet path above. |
| for (Index j = full * kPacketSize; j < num_points; ++j) { |
| const RealScalar xj = eval_points[j]; |
| RealScalar qj = alpha[0] - xj; |
| RealScalar cj = RealScalar(0); |
| if (qj <= pivmin) { |
| cj += RealScalar(1); |
| qj = numext::mini(qj, -pivmin); |
| } |
| for (Index i = 1; i < n; ++i) { |
| qj = (alpha[i] - beta_sq[i - 1] / qj) - xj; |
| if (qj <= pivmin) { |
| cj += RealScalar(1); |
| qj = numext::mini(qj, -pivmin); |
| } |
| } |
| count[j] = cj; |
| } |
| } |
| |
| /** \internal |
| * |
| * Runs the (\c t_hi - \c t_lo) independent bisections for a contiguous block of |
| * target eigenvalue indices in lock-step, writing the converged shift midpoints |
| * (in the internally normalized scale) to \c out[0 .. t_hi-t_lo). |
| * |
| * This is the unit of parallel work in tridiagonal_bisection(): the bisections of |
| * two disjoint index blocks share no state, so each thread owns one block and the |
| * blocks communicate only at the join. The vectorized batch evaluator keeps every |
| * lane of every active bracket busy within a block. |
| * |
| * \param[in] alpha normalized diagonal of T (length \c n). |
| * \param[in] beta_sq normalized squared off-diagonal (length \c n-1). |
| * \param[in] n dimension of T. |
| * \param[in] pivmin smallest allowed pivot magnitude (see tridiagonal_sturm_counts()). |
| * \param[in] bracket_lo lower end of the initial search bracket (count == 0 below it). |
| * \param[in] bracket_hi upper end of the initial search bracket (count == n above it). |
| * \param[in] t_lo, t_hi 0-based target indices for this block, half-open [t_lo, t_hi). |
| * \param[in] max_iters maximum number of bisection steps. |
| * \param[in] abs_tol absolute width below which a bracket is considered converged. |
| * \param[out] out converged midpoints for indices [t_lo, t_hi); length t_hi-t_lo. |
| */ |
| template <typename RealScalar> |
| void tridiagonal_bisection_block(const RealScalar* alpha, const RealScalar* beta_sq, Index n, RealScalar pivmin, |
| RealScalar bracket_lo, RealScalar bracket_hi, Index t_lo, Index t_hi, int max_iters, |
| RealScalar abs_tol, RealScalar* out) { |
| typedef Array<RealScalar, Dynamic, 1> ArrayType; |
| const Index m = t_hi - t_lo; |
| if (m <= 0) return; |
| |
| // The eigenvalue with 0-based index i is the value where count(x) crosses from <= i to > i. |
| ArrayType lower = ArrayType::Constant(m, bracket_lo); |
| ArrayType upper = ArrayType::Constant(m, bracket_hi); |
| const ArrayType targets = ArrayType::LinSpaced(m, RealScalar(t_lo), RealScalar(t_hi - 1)); |
| ArrayType counts(m); |
| ArrayType mid = RealScalar(0.5) * (lower + upper); |
| |
| // Each eigenvalue is recorded the iteration it first converges, using a per-element criterion that |
| // depends only on that element's own bracket. This makes the result independent of how the index |
| // range is grouped -- in particular bitwise identical for any number of threads -- because element |
| // i's converged value never depends on when its neighbors finish. (A single global convergence test |
| // would instead keep refining an already-converged eigenvalue until the slowest one in its group |
| // caught up, so the last bits would shift with the thread count.) |
| ArrayType result = mid; |
| Array<bool, Dynamic, 1> done = Array<bool, Dynamic, 1>::Constant(m, false); |
| |
| // In the early iterations many eigenvalues still share a bracket, so their (sorted) midpoints are |
| // identical and the Sturm count need only be evaluated at the distinct values and scattered back. |
| // Once every midpoint is distinct the brackets only ever get finer, so we stop deduplicating and |
| // evaluate all midpoints directly, avoiding the per-iteration bookkeeping (and its unpredictable |
| // branch). Deduplication never changes the result: count(mid[i]) is unchanged. It only pays off |
| // when there are many more points than packets (otherwise the per-point kernel cost is too small |
| // to offset the bookkeeping), so the scratch is allocated only then. |
| constexpr int kPacketSize = unpacket_traits<typename packet_traits<RealScalar>::type>::size; |
| bool deduplicate = (m >= 64 * Index(kPacketSize)); |
| ArrayType distinct, counts_distinct; |
| if (deduplicate) { |
| distinct.resize(m); |
| counts_distinct.resize(m); |
| } |
| for (int iter = 0; iter < max_iters; ++iter) { |
| if (deduplicate) { |
| const RealScalar* midp = mid.data(); |
| RealScalar* distp = distinct.data(); |
| Index nd = 0; |
| distp[nd++] = midp[0]; |
| for (Index i = 1; i < m; ++i) |
| if (midp[i] != midp[i - 1]) distp[nd++] = midp[i]; |
| tridiagonal_sturm_counts<RealScalar>(alpha, beta_sq, n, pivmin, distp, counts_distinct.data(), nd); |
| const RealScalar* cdp = counts_distinct.data(); |
| RealScalar* cp = counts.data(); |
| Index g = 0; |
| cp[0] = cdp[g]; |
| for (Index i = 1; i < m; ++i) { |
| if (midp[i] != midp[i - 1]) ++g; |
| cp[i] = cdp[g]; |
| } |
| if (nd == m) deduplicate = false; // all distinct: finer brackets stay distinct |
| } else { |
| tridiagonal_sturm_counts<RealScalar>(alpha, beta_sq, n, pivmin, mid.data(), counts.data(), m); |
| } |
| // count(mid) <= target => eigenvalue is >= mid, raise the lower bound; |
| // otherwise the eigenvalue is < mid, so lower the upper bound. |
| const auto raise = (counts <= targets); |
| lower = raise.select(mid, lower); |
| upper = raise.select(upper, mid); |
| const ArrayType new_mid = RealScalar(0.5) * (lower + upper); |
| // Freeze each eigenvalue at the midpoint of its bracket the first time that bracket is tight |
| // enough (width within abs_tol) or stops moving. Already-frozen entries keep their value. |
| const auto converged = (new_mid == mid) || ((upper - lower) <= abs_tol); |
| result = (!done && converged).select(new_mid, result); |
| done = done || converged; |
| mid = new_mid; |
| if (done.all()) break; |
| } |
| // Any eigenvalue that did not converge within max_iters keeps its last midpoint, written straight |
| // into the caller's output (no aliasing temporary: select is coefficient-wise and out is disjoint). |
| Eigen::Map<ArrayType>(out, m) = done.select(result, mid); |
| } |
| |
| /** \internal |
| * |
| * Returns the number of eigenvalues of the (normalized) symmetric tridiagonal matrix T that are |
| * strictly less than \c x, evaluated by the same pivot recurrence as tridiagonal_sturm_counts() but |
| * counting only strictly-negative pivots. This translates the endpoints of a half-open value range |
| * [vl, vu) into eigenvalue indices [count(vl), count(vu)): counting strictly-below makes an |
| * eigenvalue that lands exactly on an endpoint count as not-below, so it is kept at the closed lower |
| * end and dropped at the open upper end. (The batch evaluator above instead counts a pivot at the |
| * floor as below, which on its own would yield (vl, vu]; the two differ only when an endpoint |
| * coincides with an eigenvalue to within the pivot floor \c pivmin, where the choice is anyway |
| * numerically ambiguous.) |
| * |
| * As in the batch evaluator, a pivot whose magnitude reaches the floor \c pivmin is replaced by |
| * +/- pivmin, keeping its sign, so the following division cannot overflow. |
| * |
| * \param[in] alpha normalized diagonal of T (length \c n). |
| * \param[in] beta_sq normalized squared off-diagonal (length \c n-1). |
| * \param[in] n dimension of T. |
| * \param[in] pivmin smallest allowed pivot magnitude (see tridiagonal_sturm_counts()). |
| * \param[in] x the shift at which to count. |
| */ |
| template <typename RealScalar> |
| Index tridiagonal_sturm_count_below(const RealScalar* alpha, const RealScalar* beta_sq, Index n, RealScalar pivmin, |
| RealScalar x) { |
| RealScalar q = alpha[0] - x; |
| Index count = (q < RealScalar(0)) ? 1 : 0; |
| if (numext::abs(q) < pivmin) q = numext::copysign(pivmin, q); |
| for (Index i = 1; i < n; ++i) { |
| q = (alpha[i] - beta_sq[i - 1] / q) - x; |
| if (q < RealScalar(0)) ++count; |
| if (numext::abs(q) < pivmin) q = numext::copysign(pivmin, q); |
| } |
| return count; |
| } |
| |
| /** \internal |
| * |
| * Computes a subset of the eigenvalues of a real symmetric tridiagonal matrix T |
| * by Sturm-sequence spectral bisection (cf. LAPACK's xSTEBZ), accelerated by the |
| * vectorized batch evaluator tridiagonal_sturm_counts(). |
| * |
| * The eigenvalues are bracketed using the Gershgorin disc theorem, then refined |
| * by running an independent bisection per requested eigenvalue, with all active |
| * brackets evaluated together in each step so the packet evaluator stays full. |
| * |
| * \param[in] diag diagonal of T (length \c n). |
| * \param[in] subdiag sub-diagonal of T (length \c n-1). |
| * \param[in] range which eigenvalues to compute (see EigenvalueRange). |
| * \param[in] abs_tol requested absolute accuracy; the effective tolerance is |
| * max(abs_tol, eps * ||T||). Pass 0 for full precision. |
| * \param[out] eivalues filled with the selected eigenvalues in non-decreasing |
| * order; resized to the number of selected eigenvalues. |
| * \returns the number of eigenvalues written to \c eivalues. |
| */ |
| template <typename DiagType, typename SubdiagType, typename EivalType> |
| Index tridiagonal_bisection(const DiagType& diag, const SubdiagType& subdiag, const EigenvalueRange& range, |
| typename DiagType::Scalar abs_tol, EivalType& eivalues) { |
| typedef typename DiagType::Scalar RealScalar; |
| typedef Array<RealScalar, Dynamic, 1> ArrayType; |
| EIGEN_STATIC_ASSERT(NumTraits<RealScalar>::IsInteger == 0 && NumTraits<RealScalar>::IsComplex == 0, |
| THIS_FUNCTION_IS_NOT_FOR_INTEGER_OR_COMPLEX_TYPES) |
| |
| const Index n = diag.size(); |
| if (n == 0) { |
| eivalues.derived().resize(0); |
| return 0; |
| } |
| |
| // Normalize the matrix to O(1) to avoid overflow/underflow when squaring the |
| // off-diagonal and during the Sturm recurrence; eigenvalues scale linearly, so the |
| // scaling is undone at the very end. (The caller has already verified the input is |
| // finite.) This mirrors the uniform scaling done in SelfAdjointEigenSolver::compute(). |
| // Divide each entry directly by the scale rather than multiplying by its reciprocal: when the scale |
| // is subnormal, 1/scale overflows to infinity, the normalization silently disables itself, and the |
| // Sturm recurrence underflows and returns wrong eigenvalues. |
| RealScalar scale = diag.cwiseAbs().maxCoeff(); |
| if (n >= 2) scale = numext::maxi(scale, subdiag.cwiseAbs().maxCoeff()); |
| if (numext::is_exactly_zero(scale)) scale = RealScalar(1); |
| |
| // Local contiguous copies of the scaled matrix data, |off-diagonal|, and its square. |
| const ArrayType alpha = diag.array() / scale; |
| const ArrayType beta_abs = (n >= 2) ? ArrayType(subdiag.array().abs() / scale) : ArrayType(0); |
| const ArrayType beta_sq = (n >= 2) ? ArrayType(beta_abs.square()) : ArrayType(0); |
| |
| // Smallest pivot allowed during the Sturm recurrence (positive, floored so |
| // that a matrix with an all-zero off-diagonal still has pivmin > 0). |
| const RealScalar eps = NumTraits<RealScalar>::epsilon(); |
| const RealScalar safemin = numext::maxi(RealScalar(1) / NumTraits<RealScalar>::highest(), |
| (RealScalar(1) + eps) * (std::numeric_limits<RealScalar>::min)()); |
| const RealScalar max_beta_sq = (n >= 2) ? beta_sq.maxCoeff() : RealScalar(0); |
| const RealScalar pivmin = safemin * numext::maxi(max_beta_sq, RealScalar(1)); |
| |
| // Gershgorin bounds: row k has radius |beta_{k-1}| + |beta_k| (with the |
| // missing boundary off-diagonals taken as zero). |
| ArrayType radius(n); |
| if (n == 1) { |
| radius(0) = RealScalar(0); |
| } else { |
| radius(0) = beta_abs(0); |
| radius(n - 1) = beta_abs(n - 2); |
| if (n > 2) radius.segment(1, n - 2) = beta_abs.head(n - 2) + beta_abs.segment(1, n - 2); |
| } |
| RealScalar lambda_min = (alpha - radius).minCoeff(); |
| RealScalar lambda_max = (alpha + radius).maxCoeff(); |
| |
| // Effective tolerance and outward expansion of the bracket so that |
| // count(lambda_min) == 0 and count(lambda_max) == n (cf. LAPACK xSTEBZ). |
| const RealScalar tnorm = numext::maxi(numext::abs(lambda_min), numext::abs(lambda_max)); |
| // Convergence tolerance: refine each bracket to ~1 ulp of ||T||. This is the role of xSTEBZ's |
| // relative tolerance (it stops once b - a < RELFAC * ulp * max(|a|, |b|), RELFAC = 2), here |
| // specialized to the matrix norm tnorm and folded together with any absolute tolerance the caller |
| // requested. |
| abs_tol = numext::maxi(abs_tol, eps * tnorm); |
| // Widen the Gershgorin bracket so that, despite rounding in the Sturm recurrence, count() really does |
| // reach 0 at lambda_min and n at lambda_max. The n*eps*tnorm term bounds the worst-case count error |
| // accumulated over the n recurrence steps; the 2*pivmin term covers the pivot floor. The 2.1 prefactor |
| // is xSTEBZ's FUDGE factor: ideally 1 would suffice, but it is taken slightly larger to stay robust on |
| // sloppy arithmetic, and (per xSTEBZ) widening the bracket this way only loosens the initial search |
| // interval -- it has no effect on the accuracy of the converged eigenvalues. |
| const RealScalar expand = RealScalar(2.1) * (RealScalar(n) * eps * tnorm + RealScalar(2) * pivmin); |
| lambda_min -= expand; |
| lambda_max += expand; |
| |
| // Determine the target indices [t_lo, t_hi) and the search bracket. |
| Index t_lo = 0, t_hi = n; |
| RealScalar bracket_lo = lambda_min, bracket_hi = lambda_max; |
| if (range.kind == EigenvalueRange::ByIndex) { |
| eigen_assert(range.il >= 0 && range.il <= range.iu && range.iu <= n && "invalid eigenvalue index range"); |
| t_lo = range.il; |
| t_hi = range.iu; |
| } else if (range.kind == EigenvalueRange::ByValue) { |
| eigen_assert(range.vl <= range.vu && "invalid eigenvalue value range"); |
| const RealScalar vl = RealScalar(range.vl) / scale; |
| const RealScalar vu = RealScalar(range.vu) / scale; |
| // The eigenvalues in [vl, vu) are exactly those with indices in [t_lo, t_hi), where t_lo and t_hi |
| // count the eigenvalues strictly below each endpoint. Counting strictly-below puts an eigenvalue |
| // that coincides with an endpoint inside the interval at the closed lower end and outside it at the |
| // open upper end (see tridiagonal_sturm_count_below()). |
| t_lo = tridiagonal_sturm_count_below<RealScalar>(alpha.data(), beta_sq.data(), n, pivmin, vl); |
| t_hi = tridiagonal_sturm_count_below<RealScalar>(alpha.data(), beta_sq.data(), n, pivmin, vu); |
| bracket_lo = numext::maxi(lambda_min, vl); |
| bracket_hi = numext::mini(lambda_max, vu); |
| } |
| |
| const Index m = t_hi - t_lo; |
| eivalues.derived().resize(m); |
| if (m <= 0) return 0; |
| |
| const int max_iters = NumTraits<RealScalar>::digits() + 2; |
| ArrayType mid_all(m); |
| RealScalar* out = mid_all.data(); |
| const RealScalar* alpha_p = alpha.data(); |
| const RealScalar* beta_sq_p = beta_sq.data(); |
| |
| // The m eigenvalues are bisected independently, so the spectrum splits cleanly across threads with |
| // a single fork/join: thread t owns the contiguous index block [t_lo + lo, t_lo + hi) and writes |
| // its converged midpoints into the disjoint slice out[lo, hi). No communication until the join. |
| #if defined(EIGEN_HAS_OPENMP) |
| int nthreads = 1; |
| // Don't nest inside an existing parallel region, and only fork when there is enough work to |
| // amortize the thread overhead while still leaving each thread a SIMD-friendly chunk of points. |
| if (omp_get_num_threads() == 1) { |
| constexpr Index kPacketSize = Index(unpacket_traits<typename packet_traits<RealScalar>::type>::size); |
| // One work unit ~ one Sturm step (a packet division); kMinTaskSize is the minimum per thread. |
| const double work = m * n * max_iters; |
| const double kMinTaskSize = 131072.0; |
| const Index work_threads = Index(work / kMinTaskSize); |
| const Index point_threads = m / (8 * kPacketSize); |
| const Index pb = numext::maxi(Index(1), numext::mini(work_threads, point_threads)); |
| nthreads = int(numext::mini(pb, Index(Eigen::nbThreads()))); |
| } |
| if (nthreads > 1) { |
| #pragma omp parallel num_threads(nthreads) |
| { |
| const Index nt = omp_get_num_threads(); |
| const Index tid = omp_get_thread_num(); |
| // Balanced split: every thread gets floor(m/nt) or ceil(m/nt) consecutive indices, none empty. |
| const Index lo = tid * m / nt; |
| const Index hi = (tid + 1) * m / nt; |
| tridiagonal_bisection_block<RealScalar>(alpha_p, beta_sq_p, n, pivmin, bracket_lo, bracket_hi, t_lo + lo, |
| t_lo + hi, max_iters, abs_tol, out + lo); |
| } |
| } else |
| #endif |
| { |
| tridiagonal_bisection_block<RealScalar>(alpha_p, beta_sq_p, n, pivmin, bracket_lo, bracket_hi, t_lo, t_hi, |
| max_iters, abs_tol, out); |
| } |
| |
| // Undo the normalization. |
| eivalues = (mid_all * scale).matrix(); |
| return m; |
| } |
| |
| } // namespace internal |
| } // namespace Eigen |
| |
| #endif // EIGEN_TRIDIAGONAL_BISECTION_H |