Clean up stale TODOs/FIXMEs across Core, SparseCore, QR, SVD, Eigenvalues, and Tensor

libeigen/eigen!2462

Closes #1723

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 8957aa9..3d71b6d 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -59,8 +59,6 @@
 
   /** Default constructor, for use when strides are fixed at compile time */
   EIGEN_DEVICE_FUNC constexpr Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) {
-    // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
-    // FIXME: for Eigen 4 we should also unify this API with fix<>
     eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
   }
 
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h
index 6dc571e..91d06ad 100644
--- a/Eigen/src/Core/Swap.h
+++ b/Eigen/src/Core/Swap.h
@@ -58,8 +58,6 @@
     m_dst.template writePacket<StoreMode>(index, tmp);
   }
 
-  // TODO: find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
-  // mean no CRTP (Gael)
   template <int StoreMode, int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) {
     Index row = Base::rowIndexByOuterInner(outer, inner);
@@ -83,8 +81,6 @@
     m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
   }
 
-  // TODO: find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
-  // mean no CRTP (Gael)
   template <int StoreMode, int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
     Index row = Base::rowIndexByOuterInner(outer, inner);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
index 4700c6c..b79be58 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
@@ -380,7 +380,8 @@
   conj_helper<LhsPacket, RhsPacket, false, false> pcj;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -2058,7 +2059,8 @@
   conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = PTraits::ResPacketSize,
@@ -2390,7 +2392,8 @@
   const Index n2 = rows - 1;
 #endif
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -2733,7 +2736,8 @@
   const Index n2 = rows - 1;
 #endif
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = PTraits::ResPacketSize,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
index 0f68164..5514ffd 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
@@ -198,7 +198,6 @@
   sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
 
   if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) {
-    // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
     Packet peven = peven_mask(x);
     Packet ysin = pselect(poly_mask, y2, y1);
     Packet ycos = pselect(poly_mask, y1, y2);
@@ -315,7 +314,6 @@
   PacketI q_int;
   Packet s;
 
-  // TODO Implement huge angle argument reduction
   if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
     // Medium path: use double-word product x * (2/pi) for precise quadrant computation.
     Packet prod_hi, prod_lo;
@@ -380,7 +378,6 @@
     sign_bit = sign_cos;
     sFinalRes = pselect(poly_mask, scos, ssin);
   } else if (Func == TrigFunction::Tan) {
-    // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div.
     sign_bit = pxor(sign_sin, sign_cos);
     sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin));
   } else if (Func == TrigFunction::SinCos) {
@@ -391,9 +388,9 @@
   sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
   sFinalRes = pxor(sFinalRes, sign_bit);
 
-  // If the inputs values are higher than that a value that the argument reduction can currently address, compute them
-  // using the C++ standard library.
-  // TODO Remove it when huge angle argument reduction is implemented
+  // For inputs above huge_th the medium-path reduction loses too much precision. A vectorized
+  // Payne-Hanek reduction was investigated and judged not worthwhile (high implementation cost
+  // for what is in practice a rare path), so these inputs fall back to the scalar libm.
   if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
     const int PacketSize = unpacket_traits<Packet>::size;
     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
diff --git a/Eigen/src/Core/arch/LSX/Complex.h b/Eigen/src/Core/arch/LSX/Complex.h
index 8138210..522d3d0 100644
--- a/Eigen/src/Core/arch/LSX/Complex.h
+++ b/Eigen/src/Core/arch/LSX/Complex.h
@@ -354,7 +354,6 @@
   return res;
 }
 
-// FIXME: force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
@@ -374,7 +373,6 @@
   return pset1<Packet1cd>(*from);
 }
 
-// FIXME: force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d04b898..8615895 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -324,7 +324,6 @@
   return Packet1cd(_mm_andnot_pd(b.v, a.v));
 }
 
-// FIXME force unaligned load, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from));
@@ -344,7 +343,6 @@
   return pset1<Packet1cd>(*from);
 }
 
-// FIXME force unaligned store, this is a temporary fix
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 30718b2..c4f2b83 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -284,10 +284,15 @@
       //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
       n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
     } else if (old_k == k) {
-      // So far, no blocking at all, i.e., kc==k, and nc==n.
-      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
-      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
-      // here should be obsolete.
+      // No k- or n-blocking happened yet (kc==depth, nc>=n). gebp already
+      // strip-chunks the packed lhs via its own `actual_panel_rows` budget,
+      // so cache residency is honored whatever mc we pick here. What this
+      // branch actually governs is the size of the `mc * kc` packing buffer
+      // (blockA) that the caller allocates — capping mc keeps it bounded for
+      // tall-m / small-k shapes, where leaving mc=m would allocate up to
+      // `rows * depth * sizeof(LhsScalar)`. A budget-based alternative
+      // (e.g. cap blockA at ~L3/4) is no faster in benchmarks and increases
+      // heap use, so the original L1/L2-residency tuning is kept.
       Index problem_size = k * n * sizeof(LhsScalar);
       Index actual_lm = actual_l2;
       Index max_mc = m;
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 5f6c395..cb58d30 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -200,7 +200,9 @@
   conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template, and on
+  // modern x86 aligned/unaligned packet loads are equivalent anyway.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
@@ -365,7 +367,9 @@
   const Index n4 = rows - 3;
   const Index n2 = rows - 1;
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
+  // LhsAlignment stays Unaligned; enabling aligned reads would require
+  // propagating the Mapper's Alignment through the run() template, and on
+  // modern x86 aligned/unaligned packet loads are equivalent anyway.
   enum {
     LhsAlignment = Unaligned,
     ResPacketSize = Traits::ResPacketSize,
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h
index 6e50a43..b9dbc6a 100644
--- a/Eigen/src/Eigenvalues/RealSchur.h
+++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -343,9 +343,9 @@
 template <typename MatrixType>
 inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() {
   const Index size = m_matT.cols();
-  // FIXME: a triangular reduction would be more efficient here.
-  // Scalar norm = m_matT.upper().cwiseAbs().sum()
-  //               + m_matT.bottomLeftCorner(size-1,size-1).diagonal().cwiseAbs().sum();
+  // m_matT is upper-Hessenberg, so per column only rows [0, j+1] are nonzero.
+  // The column-wise loop touches ~n^2/2 entries; scanning the full matrix
+  // would double that, and TriangularView has no direct cwiseAbs().sum().
   Scalar norm(0);
   for (Index j = 0; j < size; ++j) norm += m_matT.col(j).segment(0, (std::min)(size, j + 2)).cwiseAbs().sum();
   return norm;
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h
index aef8739..f54c444 100644
--- a/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -22,7 +22,11 @@
 struct TridiagonalizationMatrixTReturnType;
 template <typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType>> : public traits<typename MatrixType::PlainObject> {
-  typedef typename MatrixType::PlainObject ReturnType;  // FIXME: consider using BandMatrix as ReturnType.
+  // matrixT() returns a dense n x n matrix. A band-stored alternative (e.g. a
+  // future matrixTBand() returning BandMatrix<Scalar, Dynamic, Dynamic, 1, 1>)
+  // would be ~3n storage instead of n^2, but changing this ReturnType in place
+  // would be API-breaking for callers that assume a dense matrix.
+  typedef typename MatrixType::PlainObject ReturnType;
   enum { Flags = 0 };
 };
 
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h
index e7173a5..c65c288 100644
--- a/Eigen/src/QR/HouseholderQR.h
+++ b/Eigen/src/QR/HouseholderQR.h
@@ -384,15 +384,19 @@
   }
 }
 
-// TODO: expose a public API for rank-1 QR update.
 /** \internal
- * Basically a modified copy of @c Eigen::internal::householder_qr_inplace_unblocked that
- * performs a rank-1 update of the QR matrix in compact storage. This function assumes, that
- * the first @c k-1 columns of the matrix @c mat contain the QR decomposition of \f$A^N\f$ up to
- * column k-1. Then the QR decomposition of the k-th column (given by @c newColumn) is computed by
- * applying the k-1 Householder projectors on it and finally compute the projector \f$H_k\f$ of
- * it. On exit the matrix @c mat and the vector @c hCoeffs contain the QR decomposition of the
- * first k columns of \f$A^N\f$. The \a tempData argument must point to at least mat.cols() scalars.  */
+ * Column-insert / column-replace helper for a compact-storage Householder QR.
+ * Given a matrix @c mat and @c hCoeffs holding the QR factorization of the first @c k columns of
+ * some matrix A, this function replaces column @c k of that factorization with @c newColumn: it
+ * applies the existing k Householder reflectors (stored in columns 0..k-1 of @c mat and in
+ * @c hCoeffs.head(k)) to @c newColumn, then computes the k-th reflector in place. On exit
+ * @c mat.leftCols(k+1) and @c hCoeffs.head(k+1) hold the QR factorization of A.leftCols(k) with
+ * @c newColumn inserted at position @c k. @c tempData must point to at least @c mat.cols() scalars.
+ *
+ * Despite the historical "rank-1 update" label, this is not a full QR update in the
+ * Gill-Golub-Murray-Saunders sense: there is no public API for @c QR(A + u vT) or for column/row
+ * delete. See libeigen/eigen#3072 for a tracker of that feature gap. Currently only NNLS relies on
+ * this helper; the signature is tuned to its active-set bookkeeping. */
 template <typename MatrixQR, typename HCoeffs, typename VectorQR>
 void householder_qr_inplace_update(MatrixQR& mat, HCoeffs& hCoeffs, const VectorQR& newColumn,
                                    typename MatrixQR::Index k, typename MatrixQR::Scalar* tempData) {
diff --git a/Eigen/src/SVD/BDCSVDImpl.h b/Eigen/src/SVD/BDCSVDImpl.h
index 9b818fe..19444e1 100644
--- a/Eigen/src/SVD/BDCSVDImpl.h
+++ b/Eigen/src/SVD/BDCSVDImpl.h
@@ -307,11 +307,6 @@
 // the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
 // order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except
 // that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
-//
-// TODO: opportunities for optimization: better root-finding algorithm, better stopping criterion,
-// better handling of round-off errors, and consistent ordering.
-// For instance, to solve the secular equation using FMM, see
-// http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
 template <typename RealScalar_>
 void bdcsvd_impl<RealScalar_>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) {
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index dc2866e..1753c4a 100644
--- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -79,8 +79,6 @@
       const Index t200 = rows / 11;  // 11 == (log2(200)*1.39)
       const Index t = (rows * 100) / 139;
 
-      // FIXME: reserve space for the expected number of non-zeros.
-      // FIXME: implement faster sorting for very small nnz counts.
       // if the result is sparse enough => use a quick sort
       // otherwise => loop through the entire vector
       // In order to avoid to perform an expensive log2 when the
@@ -127,7 +125,8 @@
 
     // If the result is tall and thin (in the extreme case a column vector)
     // then it is faster to sort the coefficients inplace instead of transposing twice.
-    // FIXME: this heuristic has known limitations and should be improved.
+    // The dimension-only test here ignores nnz / per-column density; a proper
+    // cost model using estimated_nnz_prod would pick the right path more often.
     if (lhs.rows() > rhs.cols()) {
       using ColMajorMatrix = typename sparse_eval<ColMajorMatrixAux, ResultType::RowsAtCompileTime,
                                                   ResultType::ColsAtCompileTime, ColMajorMatrixAux::Flags>::type;
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h
index 8b5cb2a..2248def 100644
--- a/Eigen/src/SparseCore/SparseMatrix.h
+++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1277,7 +1277,8 @@
   using SrcXprType =
       CwiseBinaryOp<scalar_disjunction_op<DupFunctor, Scalar>, const SparseMatrixType, const SparseMatrixType>;
 
-  // TODO: process triplets without making a copy
+  // Saving the trips temporary would need a direct mat+triplets merge with
+  // on-the-fly duplicate collapsing (non-trivial).
   SparseMatrixType trips(mat.rows(), mat.cols());
   set_from_triplets_sorted(begin, end, trips, dup_func);
 
diff --git a/unsupported/Eigen/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/src/Tensor/TensorBroadcasting.h
index f94d39b..23c0dec 100644
--- a/unsupported/Eigen/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/src/Tensor/TensorBroadcasting.h
@@ -237,7 +237,12 @@
     }
   }
 
-  // TODO: attempt to speed this up. The integer divisions and modulo are slow
+  // The per-dim integer div/mod look expensive but amortize well: packet paths
+  // call this once per PacketSize outputs, and modern x86 hardware div is
+  // ~20 cycles. Prototyped replacing div/mod with TensorIntDivisor (the
+  // pattern used in TensorShuffling et al.); measured net-negative on Intel
+  // Raptor Lake across bench_broadcasting (more shapes regress 3-5% than
+  // improve). Left as hardware div.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
     Index inputIndex = 0;
     EIGEN_UNROLL_LOOP
@@ -583,8 +588,8 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
-    // tensors. But this might need further tuning.
+    // Target the L1 cache: measured ~30% faster than targeting the last-level
+    // cache on large tensors.
     const size_t target_size = m_device.firstLevelCacheSize();
     return internal::TensorBlockResourceRequirements::merge(
         m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
diff --git a/unsupported/Eigen/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/src/Tensor/TensorConcatenation.h
index ea56b96..15811a6 100644
--- a/unsupported/Eigen/src/Tensor/TensorConcatenation.h
+++ b/unsupported/Eigen/src/Tensor/TensorConcatenation.h
@@ -181,7 +181,11 @@
     m_rightImpl.cleanup();
   }
 
-  // TODO(phli): Attempt to speed this up. The integer divisions and modulo are slow.
+  // The per-dim integer div/mod in this loop are the obvious "slow" candidates,
+  // but the same TensorIntDivisor substitution was measured net-negative on
+  // Intel Raptor Lake when prototyped for TensorBroadcasting (see the comment
+  // on TensorBroadcasting::indexColMajor). Modern x86 hardware div (~20
+  // cycles) amortizes well and the mul+shifts don't win back the added state.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
     // Collect dimension-wise indices (subs).
     array<Index, NumDims> subs;