BDCSVD: eliminate temporaries flagged by long-standing FIXMEs

libeigen/eigen!2450

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h
index 35c0a92..2a8499a 100644
--- a/Eigen/src/SVD/BDCSVD.h
+++ b/Eigen/src/SVD/BDCSVD.h
@@ -256,6 +256,9 @@
   internal::UpperBidiagonalization<MatrixX> bid;
   MatrixX copyWorkspace;
   MatrixX reducedTriangle;
+  // Reused workspace for HouseholderSequence::applyThisOnTheLeft in copyUV().
+  // Without this, each apply allocates a fresh row vector.
+  Matrix<Scalar, 1, Dynamic, RowMajor> m_householderWorkspace;
 
   using Base::m_computationOptions;
   using Base::m_computeThinU;
@@ -363,9 +366,14 @@
   //**** step 2 - Divide & Conquer
   m_impl.naiveU().setZero();
   m_impl.naiveV().setZero();
-  // FIXME: this line involves a temporary matrix.
-  m_impl.computed().topRows(diagSize()) = bid.bidiagonal().toDenseMatrix().transpose();
-  m_impl.computed().template bottomRows<1>().setZero();
+  // The transposed bidiagonal has only the main diagonal and one sub-diagonal;
+  // fill those directly instead of materializing a dense temporary.
+  // Note: BandMatrix::diagonal<N>() const has a latent type bug (returns
+  // Block<CoefficientsType, ...> instead of Block<const CoefficientsType, ...>),
+  // so use the index-based overload which is correctly const-qualified.
+  m_impl.computed().setZero();
+  m_impl.computed().topRows(diagSize()).diagonal() = bid.bidiagonal().diagonal();
+  m_impl.computed().topRows(diagSize()).template diagonal<-1>() = bid.bidiagonal().diagonal(1);
   m_impl.divide(0, diagSize() - 1, 0, 0, 0);
   m_info = m_impl.info();
   m_numIters = m_impl.numIters();
@@ -410,28 +418,33 @@
 EIGEN_DONT_INLINE void BDCSVD<MatrixType, Options>::copyUV(const HouseholderU& householderU,
                                                            const HouseholderV& householderV, const NaiveU& naiveU,
                                                            const NaiveV& naiveV) {
-  // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa
+  // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa.
+  // The cast is applied to the diagSize x diagSize corner rather than to the full naive matrix,
+  // so only that block's coefficients are converted when Scalar != RealScalar; reuse
+  // m_householderWorkspace across the applyThisOnTheLeft calls so each apply does not allocate.
   if (computeU()) {
     Index Ucols = m_computeThinU ? diagSize() : rows();
     m_matrixU = MatrixX::Identity(rows(), Ucols);
     m_matrixU.topLeftCorner(diagSize(), diagSize()) =
-        naiveV.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
-    // FIXME: the following conditionals involve temporary buffers.
-    if (m_useQrDecomp)
-      m_matrixU.topLeftCorner(householderU.cols(), diagSize()).applyOnTheLeft(householderU);
-    else
-      m_matrixU.applyOnTheLeft(householderU);
+        naiveV.topLeftCorner(diagSize(), diagSize()).template cast<Scalar>();
+    if (m_useQrDecomp) {
+      auto sub = m_matrixU.topLeftCorner(householderU.cols(), diagSize());
+      householderU.applyThisOnTheLeft(sub, m_householderWorkspace);
+    } else {
+      householderU.applyThisOnTheLeft(m_matrixU, m_householderWorkspace);
+    }
   }
   if (computeV()) {
     Index Vcols = m_computeThinV ? diagSize() : cols();
     m_matrixV = MatrixX::Identity(cols(), Vcols);
     m_matrixV.topLeftCorner(diagSize(), diagSize()) =
-        naiveU.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
-    // FIXME: the following conditionals involve temporary buffers.
-    if (m_useQrDecomp)
-      m_matrixV.topLeftCorner(householderV.cols(), diagSize()).applyOnTheLeft(householderV);
-    else
-      m_matrixV.applyOnTheLeft(householderV);
+        naiveU.topLeftCorner(diagSize(), diagSize()).template cast<Scalar>();
+    if (m_useQrDecomp) {
+      auto sub = m_matrixV.topLeftCorner(householderV.cols(), diagSize());
+      householderV.applyThisOnTheLeft(sub, m_householderWorkspace);
+    } else {
+      householderV.applyThisOnTheLeft(m_matrixV, m_householderWorkspace);
+    }
   }
 }
 
diff --git a/Eigen/src/SVD/BDCSVDImpl.h b/Eigen/src/SVD/BDCSVDImpl.h
index 7ba5bbf..9b818fe 100644
--- a/Eigen/src/SVD/BDCSVDImpl.h
+++ b/Eigen/src/SVD/BDCSVDImpl.h
@@ -86,6 +86,10 @@
   MatrixXr m_computed;
   ArrayXr m_workspace;
   ArrayXi m_workspaceI;
+  // Reused base-case JacobiSVDs (one per option set) so that recursive divide()
+  // calls don't reallocate JacobiSVD's internal U/V/sigma buffers each time.
+  JacobiSVD<MatrixXr, ComputeFullU> m_baseSvdU;
+  JacobiSVD<MatrixXr, ComputeFullU | ComputeFullV> m_baseSvdUV;
   int m_algoswap;
   bool m_compU, m_compV;
   int m_numIters;
@@ -201,13 +205,10 @@
   // We use the other algorithm which is more efficient for small
   // matrices.
   if (n < m_algoswap) {
-    // FIXME: this block involves temporaries.
     if (m_compV) {
-      JacobiSVD<MatrixXr, ComputeFullU | ComputeFullV> baseSvd;
-      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
+      computeBaseCase(m_baseSvdUV, n, firstCol, firstRowW, firstColW, shift);
     } else {
-      JacobiSVD<MatrixXr, ComputeFullU> baseSvd;
-      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
+      computeBaseCase(m_baseSvdU, n, firstCol, firstRowW, firstColW, shift);
     }
     return;
   }