Clean up stale TODOs/FIXMEs across Core, SparseCore, QR, SVD, Eigenvalues, and Tensor libeigen/eigen!2462 Closes #1723 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 8957aa9..3d71b6d 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h
@@ -59,8 +59,6 @@ /** Default constructor, for use when strides are fixed at compile time */ EIGEN_DEVICE_FUNC constexpr Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { - // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic. - // FIXME: for Eigen 4 we should also unify this API with fix<> eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); }
diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index 6dc571e..91d06ad 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h
@@ -58,8 +58,6 @@ m_dst.template writePacket<StoreMode>(index, tmp); } - // TODO: find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I - // mean no CRTP (Gael) template <int StoreMode, int LoadMode, typename PacketType> EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); @@ -83,8 +81,6 @@ m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count); } - // TODO: find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I - // mean no CRTP (Gael) template <int StoreMode, int LoadMode, typename PacketType> EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) { Index row = Base::rowIndexByOuterInner(outer, inner);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc index 4700c6c..b79be58 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc +++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
@@ -380,7 +380,8 @@ conj_helper<LhsPacket, RhsPacket, false, false> pcj; const Index lhsStride = lhs.stride(); - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template. enum { LhsAlignment = Unaligned, ResPacketSize = Traits::ResPacketSize, @@ -2058,7 +2059,8 @@ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj; const Index lhsStride = lhs.stride(); - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template. enum { LhsAlignment = Unaligned, ResPacketSize = PTraits::ResPacketSize, @@ -2390,7 +2392,8 @@ const Index n2 = rows - 1; #endif - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template. enum { LhsAlignment = Unaligned, ResPacketSize = Traits::ResPacketSize, @@ -2733,7 +2736,8 @@ const Index n2 = rows - 1; #endif - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template. enum { LhsAlignment = Unaligned, ResPacketSize = PTraits::ResPacketSize,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h index 0f68164..5514ffd 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathTrig.h
@@ -198,7 +198,6 @@ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit if ((Func == TrigFunction::SinCos) || (Func == TrigFunction::Tan)) { - // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div. Packet peven = peven_mask(x); Packet ysin = pselect(poly_mask, y2, y1); Packet ycos = pselect(poly_mask, y1, y2); @@ -315,7 +314,6 @@ PacketI q_int; Packet s; - // TODO Implement huge angle argument reduction if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) { // Medium path: use double-word product x * (2/pi) for precise quadrant computation. Packet prod_hi, prod_lo; @@ -380,7 +378,6 @@ sign_bit = sign_cos; sFinalRes = pselect(poly_mask, scos, ssin); } else if (Func == TrigFunction::Tan) { - // TODO(rmlarsen): Add single polynomial for tan(x) instead of paying for sin+cos+div. sign_bit = pxor(sign_sin, sign_cos); sFinalRes = pdiv(pselect(poly_mask, ssin, scos), pselect(poly_mask, scos, ssin)); } else if (Func == TrigFunction::SinCos) { @@ -391,9 +388,9 @@ sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit sFinalRes = pxor(sFinalRes, sign_bit); - // If the inputs values are higher than that a value that the argument reduction can currently address, compute them - // using the C++ standard library. - // TODO Remove it when huge angle argument reduction is implemented + // For inputs above huge_th the medium-path reduction loses too much precision. A vectorized + // Payne-Hanek reduction was investigated and judged not worthwhile (high implementation cost + // for what is in practice a rare path), so these inputs fall back to the scalar libm. if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) { const int PacketSize = unpacket_traits<Packet>::size; EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
diff --git a/Eigen/src/Core/arch/LSX/Complex.h b/Eigen/src/Core/arch/LSX/Complex.h index 8138210..522d3d0 100644 --- a/Eigen/src/Core/arch/LSX/Complex.h +++ b/Eigen/src/Core/arch/LSX/Complex.h
@@ -354,7 +354,6 @@ return res; } -// FIXME: force unaligned load, this is a temporary fix template <> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); @@ -374,7 +373,6 @@ return pset1<Packet1cd>(*from); } -// FIXME: force unaligned store, this is a temporary fix template <> EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index d04b898..8615895 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -324,7 +324,6 @@ return Packet1cd(_mm_andnot_pd(b.v, a.v)); } -// FIXME force unaligned load, this is a temporary fix template <> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); @@ -344,7 +343,6 @@ return pset1<Packet1cd>(*from); } -// FIXME force unaligned store, this is a temporary fix template <> EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 30718b2..c4f2b83 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -284,10 +284,15 @@ // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1" n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1)))); } else if (old_k == k) { - // So far, no blocking at all, i.e., kc==k, and nc==n. - // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 - // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic - // here should be obsolete. + // No k- or n-blocking happened yet (kc==depth, nc>=n). gebp already + // strip-chunks the packed lhs via its own `actual_panel_rows` budget, + // so cache residency is honored whatever mc we pick here. What this + // branch actually governs is the size of the `mc * kc` packing buffer + // (blockA) that the caller allocates — capping mc keeps it bounded for + // tall-m / small-k shapes, where leaving mc=m would allocate up to + // `rows * depth * sizeof(LhsScalar)`. A budget-based alternative + // (e.g. cap blockA at ~L3/4) is no faster in benchmarks and increases + // heap use, so the original L1/L2-residency tuning is kept. Index problem_size = k * n * sizeof(LhsScalar); Index actual_lm = actual_l2; Index max_mc = m;
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 5f6c395..cb58d30 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -200,7 +200,9 @@ conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter; const Index lhsStride = lhs.stride(); - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template, and on + // modern x86 aligned/unaligned packet loads are equivalent anyway. enum { LhsAlignment = Unaligned, ResPacketSize = Traits::ResPacketSize, @@ -365,7 +367,9 @@ const Index n4 = rows - 3; const Index n2 = rows - 1; - // TODO: for padded aligned inputs, we could enable aligned reads + // LhsAlignment stays Unaligned; enabling aligned reads would require + // propagating the Mapper's Alignment through the run() template, and on + // modern x86 aligned/unaligned packet loads are equivalent anyway. enum { LhsAlignment = Unaligned, ResPacketSize = Traits::ResPacketSize,
diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 6e50a43..b9dbc6a 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h
@@ -343,9 +343,9 @@ template <typename MatrixType> inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() { const Index size = m_matT.cols(); - // FIXME: a triangular reduction would be more efficient here. - // Scalar norm = m_matT.upper().cwiseAbs().sum() - // + m_matT.bottomLeftCorner(size-1,size-1).diagonal().cwiseAbs().sum(); + // m_matT is upper-Hessenberg, so per column only rows [0, j+1] are nonzero. + // The column-wise loop touches ~n^2/2 entries; scanning the full matrix + // would double that, and TriangularView has no direct cwiseAbs().sum(). Scalar norm(0); for (Index j = 0; j < size; ++j) norm += m_matT.col(j).segment(0, (std::min)(size, j + 2)).cwiseAbs().sum(); return norm;
diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index aef8739..f54c444 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -22,7 +22,11 @@ struct TridiagonalizationMatrixTReturnType; template <typename MatrixType> struct traits<TridiagonalizationMatrixTReturnType<MatrixType>> : public traits<typename MatrixType::PlainObject> { - typedef typename MatrixType::PlainObject ReturnType; // FIXME: consider using BandMatrix as ReturnType. + // matrixT() returns a dense n x n matrix. A band-stored alternative (e.g. a + // future matrixTBand() returning BandMatrix<Scalar, Dynamic, Dynamic, 1, 1>) + // would be ~3n storage instead of n^2, but changing this ReturnType in place + // would be API-breaking for callers that assume a dense matrix. + typedef typename MatrixType::PlainObject ReturnType; enum { Flags = 0 }; };
diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index e7173a5..c65c288 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h
@@ -384,15 +384,19 @@ } } -// TODO: expose a public API for rank-1 QR update. /** \internal - * Basically a modified copy of @c Eigen::internal::householder_qr_inplace_unblocked that - * performs a rank-1 update of the QR matrix in compact storage. This function assumes, that - * the first @c k-1 columns of the matrix @c mat contain the QR decomposition of \f$A^N\f$ up to - * column k-1. Then the QR decomposition of the k-th column (given by @c newColumn) is computed by - * applying the k-1 Householder projectors on it and finally compute the projector \f$H_k\f$ of - * it. On exit the matrix @c mat and the vector @c hCoeffs contain the QR decomposition of the - * first k columns of \f$A^N\f$. The \a tempData argument must point to at least mat.cols() scalars. */ + * Column-insert / column-replace helper for a compact-storage Householder QR. + * Given a matrix @c mat and @c hCoeffs holding the QR factorization of the first @c k columns of + * some matrix A, this function replaces column @c k of that factorization with @c newColumn: it + * applies the existing k Householder reflectors (stored in columns 0..k-1 of @c mat and in + * @c hCoeffs.head(k)) to @c newColumn, then computes the k-th reflector in place. On exit + * @c mat.leftCols(k+1) and @c hCoeffs.head(k+1) hold the QR factorization of A.leftCols(k) with + * @c newColumn inserted at position @c k. @c tempData must point to at least @c mat.cols() scalars. + * + * Despite the historical "rank-1 update" label, this is not a full QR update in the + * Gill-Golub-Murray-Saunders sense: there is no public API for @c QR(A + u vT) or for column/row + * delete. See libeigen/eigen#3072 for a tracker of that feature gap. Currently only NNLS relies on + * this helper; the signature is tuned to its active-set bookkeeping. */ template <typename MatrixQR, typename HCoeffs, typename VectorQR> void householder_qr_inplace_update(MatrixQR& mat, HCoeffs& hCoeffs, const VectorQR& newColumn, typename MatrixQR::Index k, typename MatrixQR::Scalar* tempData) {
diff --git a/Eigen/src/SVD/BDCSVDImpl.h b/Eigen/src/SVD/BDCSVDImpl.h index 9b818fe..19444e1 100644 --- a/Eigen/src/SVD/BDCSVDImpl.h +++ b/Eigen/src/SVD/BDCSVDImpl.h
@@ -307,11 +307,6 @@ // the first column and on the diagonal and has undergone deflation, so diagonal is in increasing // order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except // that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order. -// -// TODO: opportunities for optimization: better root-finding algorithm, better stopping criterion, -// better handling of round-off errors, and consistent ordering. -// For instance, to solve the secular equation using FMM, see -// http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template <typename RealScalar_> void bdcsvd_impl<RealScalar_>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index dc2866e..1753c4a 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -79,8 +79,6 @@ const Index t200 = rows / 11; // 11 == (log2(200)*1.39) const Index t = (rows * 100) / 139; - // FIXME: reserve space for the expected number of non-zeros. - // FIXME: implement faster sorting for very small nnz counts. // if the result is sparse enough => use a quick sort // otherwise => loop through the entire vector // In order to avoid to perform an expensive log2 when the @@ -127,7 +125,8 @@ // If the result is tall and thin (in the extreme case a column vector) // then it is faster to sort the coefficients inplace instead of transposing twice. - // FIXME: this heuristic has known limitations and should be improved. + // The dimension-only test here ignores nnz / per-column density; a proper + // cost model using estimated_nnz_prod would pick the right path more often. if (lhs.rows() > rhs.cols()) { using ColMajorMatrix = typename sparse_eval<ColMajorMatrixAux, ResultType::RowsAtCompileTime, ResultType::ColsAtCompileTime, ColMajorMatrixAux::Flags>::type;
diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 8b5cb2a..2248def 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h
@@ -1277,7 +1277,8 @@ using SrcXprType = CwiseBinaryOp<scalar_disjunction_op<DupFunctor, Scalar>, const SparseMatrixType, const SparseMatrixType>; - // TODO: process triplets without making a copy + // Saving the trips temporary would need a direct mat+triplets merge with + // on-the-fly duplicate collapsing (non-trivial). SparseMatrixType trips(mat.rows(), mat.cols()); set_from_triplets_sorted(begin, end, trips, dup_func);
diff --git a/unsupported/Eigen/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/src/Tensor/TensorBroadcasting.h index f94d39b..23c0dec 100644 --- a/unsupported/Eigen/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/src/Tensor/TensorBroadcasting.h
@@ -237,7 +237,12 @@ } } - // TODO: attempt to speed this up. The integer divisions and modulo are slow + // The per-dim integer div/mod look expensive but amortize well: packet paths + // call this once per PacketSize outputs, and modern x86 hardware div is + // ~20 cycles. Prototyped replacing div/mod with TensorIntDivisor (the + // pattern used in TensorShuffling et al.); measured net-negative on Intel + // Raptor Lake across bench_broadcasting (more shapes regress 3-5% than + // improve). Left as hardware div. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { Index inputIndex = 0; EIGEN_UNROLL_LOOP @@ -583,8 +588,8 @@ } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const { - // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large - // tensors. But this might need further tuning. + // Target the L1 cache: measured ~30% faster than targeting the last-level + // cache on large tensors. const size_t target_size = m_device.firstLevelCacheSize(); return internal::TensorBlockResourceRequirements::merge( m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
diff --git a/unsupported/Eigen/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/src/Tensor/TensorConcatenation.h index ea56b96..15811a6 100644 --- a/unsupported/Eigen/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/src/Tensor/TensorConcatenation.h
@@ -181,7 +181,11 @@ m_rightImpl.cleanup(); } - // TODO(phli): Attempt to speed this up. The integer divisions and modulo are slow. + // The per-dim integer div/mod in this loop are the obvious "slow" candidates, + // but the same TensorIntDivisor substitution was measured net-negative on + // Intel Raptor Lake when prototyped for TensorBroadcasting (see the comment + // on TensorBroadcasting::indexColMajor). Modern x86 hardware div (~20 + // cycles) amortizes well and the mul+shifts don't win back the added state. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { // Collect dimension-wise indices (subs). array<Index, NumDims> subs;