Merged in advanpix/eigen-mp-devs (pull request PR-32) Fixes for SparseMatrix to support non-POD scalar types

diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 358b318..3efdcfe 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h

@@ -21,6 +21,9 @@
   * \param XprType the type of the expression in which we are taking a block
   * \param BlockRows the number of rows of the block we are taking at compile time (optional)
   * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \param InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+  *        to set of columns of a column major matrix (optional). The parameter allows to determine
+  *        at compile time whether aligned access is possible on the block expression.
   *
   * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
   * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and

diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h
index 887e718..419f901 100644
--- a/Eigen/src/Geometry/Transform.h
+++ b/Eigen/src/Geometry/Transform.h

@@ -208,9 +208,9 @@
   /** type of a vector */
   typedef Matrix<Scalar,Dim,1> VectorType;
   /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1,int(Mode)==(AffineCompact)> TranslationPart;
+  typedef Block<MatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> TranslationPart;
   /** type of a read reference to the translation part of the rotation */
-  typedef const Block<ConstMatrixType,Dim,1,int(Mode)==(AffineCompact)> ConstTranslationPart;
+  typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
   

diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h
index 587de37..6cac3af 100644
--- a/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/Eigen/src/SVD/UpperBidiagonalization.h

@@ -31,7 +31,7 @@
     typedef typename MatrixType::Index Index;
     typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
     typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
-    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0> BidiagonalType;
+    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
     typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
     typedef HouseholderSequence<
@@ -61,6 +61,7 @@
     }
     
     UpperBidiagonalization& compute(const MatrixType& matrix);
+    UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);
     
     const MatrixType& householder() const { return m_householder; }
     const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
@@ -85,45 +86,294 @@
     bool m_isInitialized;
 };
 
-template<typename _MatrixType>
-UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+// Standard upper bidiagonalization without fancy optimizations
+// This version should be faster for small matrix size
+template<typename MatrixType>
+void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
+                                              typename MatrixType::RealScalar *diagonal,
+                                              typename MatrixType::RealScalar *upper_diagonal,
+                                              typename MatrixType::Scalar* tempData = 0)
 {
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
-  
-  m_householder = matrix;
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
 
-  ColVectorType temp(rows);
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+  Index size = (std::min)(rows, cols);
+
+  typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixType::MaxRowsAtCompileTime,1> TempType;
+  TempType tempVector;
+  if(tempData==0)
+  {
+    tempVector.resize(rows);
+    tempData = tempVector.data();
+  }
 
   for (Index k = 0; /* breaks at k==cols-1 below */ ; ++k)
   {
     Index remainingRows = rows - k;
     Index remainingCols = cols - k - 1;
 
-    // construct left householder transform in-place in m_householder
-    m_householder.col(k).tail(remainingRows)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k),
-                                         m_bidiagonal.template diagonal<0>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows, remainingCols)
-                 .applyHouseholderOnTheLeft(m_householder.col(k).tail(remainingRows-1),
-                                            m_householder.coeff(k,k),
-                                            temp.data());
+    // construct left householder transform in-place in A
+    mat.col(k).tail(remainingRows)
+       .makeHouseholderInPlace(mat.coeffRef(k,k), diagonal[k]);
+    // apply householder transform to remaining part of A on the left
+    mat.bottomRightCorner(remainingRows, remainingCols)
+       .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows-1), mat.coeff(k,k), tempData);
 
     if(k == cols-1) break;
-    
-    // construct right householder transform in-place in m_householder
-    m_householder.row(k).tail(remainingCols)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k+1),
-                                         m_bidiagonal.template diagonal<1>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows-1, remainingCols)
-                 .applyHouseholderOnTheRight(m_householder.row(k).tail(remainingCols-1).transpose(),
-                                             m_householder.coeff(k,k+1),
-                                             temp.data());
+
+    // construct right householder transform in-place in mat
+    mat.row(k).tail(remainingCols)
+       .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
+    // apply householder transform to remaining part of mat on the left
+    mat.bottomRightCorner(remainingRows-1, remainingCols)
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
   }
+}
+
+/** \internal
+  * Helper routine for the block reduction to upper bidiagonal form.
+  *
+  * Let's partition the matrix A:
+  * 
+  *      | A00 A01 |
+  *  A = |         |
+  *      | A10 A11 |
+  *
+  * This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
+  * and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
+  * is updated using matrix-matrix products:
+  *   A22 -= V * Y^T - X * U^T
+  * where V and U contains the left and right Householder vectors. U and V are stored in A10, and A01
+  * respectively, and the update matrices X and Y are computed during the reduction.
+  * 
+  */
+template<typename MatrixType>
+void upperbidiagonalization_blocked_helper(MatrixType& A,
+                                           typename MatrixType::RealScalar *diagonal,
+                                           typename MatrixType::RealScalar *upper_diagonal,
+                                           typename MatrixType::Index bs,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic> > X,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic> > Y)
+{
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Ref<Matrix<Scalar, Dynamic, 1> >                    SubColumnType;
+  typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, InnerStride<> >  SubRowType;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic> >              SubMatType;
+  
+  Index brows = A.rows();
+  Index bcols = A.cols();
+
+  Scalar tau_u, tau_u_prev(0), tau_v;
+
+  for(Index k = 0; k < bs; ++k)
+  {
+    Index remainingRows = brows - k;
+    Index remainingSizeInBlock = bs - k - 1;
+    Index remainingCols = bcols - k - 1;
+
+    SubMatType X_k1( X.block(k,0, remainingRows,k) );
+    SubMatType V_k1( A.block(k,0, remainingRows,k) );
+
+    // 1 - update the k-th column of A
+    SubColumnType v_k = A.col(k).tail(remainingRows);
+          v_k -= V_k1 * Y.row(k).head(k).adjoint();
+    if(k) v_k -= X_k1 * A.col(k).head(k);
+    
+    // 2 - construct left Householder transform in-place
+    v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
+       
+    if(k+1<bcols)
+    {
+      SubMatType Y_k  ( Y.block(k+1,0, remainingCols, k+1) );
+      SubMatType U_k1 ( A.block(0,k+1, k,remainingCols) );
+      
+      // this eases the application of Householder transforAions
+      // A(k,k) will store tau_v later
+      A(k,k) = Scalar(1);
+
+      // 3 - Compute y_k^T = tau_v * ( A^T*v_k - Y_k-1*V_k-1^T*v_k - U_k-1*X_k-1^T*v_k )
+      {
+        SubColumnType y_k( Y.col(k).tail(remainingCols) );
+        
+        // let's use the begining of column k of Y as a temporary vector
+        SubColumnType tmp( Y.col(k).head(k) );
+        y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
+        tmp.noalias()  = V_k1.adjoint()  * v_k;
+        y_k.noalias() -= Y_k.leftCols(k) * tmp;
+        tmp.noalias()  = X_k1.adjoint()  * v_k;
+        y_k.noalias() -= U_k1.adjoint()  * tmp;
+        y_k *= numext::conj(tau_v);
+      }
+
+      // 4 - update k-th row of A (it will become u_k)
+      SubRowType u_k( A.row(k).tail(remainingCols) );
+      u_k = u_k.conjugate();
+      {
+        u_k -= Y_k * A.row(k).head(k+1).adjoint();
+        if(k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
+      }
+
+      // 5 - construct right Householder transform in-placecols
+      u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);
+
+      // this eases the application of Householder transforAions
+      // A(k,k+1) will store tau_u later
+      A(k,k+1) = Scalar(1);
+
+      // 6 - Compute x_k = tau_u * ( A*u_k - X_k-1*U_k-1^T*u_k - V_k*Y_k^T*u_k )
+      {
+        SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
+        
+        // let's use the begining of column k of X as a temporary vectors
+        // note that tmp0 and tmp1 overlaps
+        SubColumnType tmp0 ( X.col(k).head(k) ),
+                      tmp1 ( X.col(k).head(k+1) );
+                    
+        x_k.noalias()   = A.block(k+1,k+1, remainingRows-1,remainingCols) * u_k.transpose(); // bottleneck
+        tmp0.noalias()  = U_k1 * u_k.transpose();
+        x_k.noalias()  -= X_k1.bottomRows(remainingRows-1) * tmp0;
+        tmp1.noalias()  = Y_k.adjoint() * u_k.transpose();
+        x_k.noalias()  -= A.block(k+1,0, remainingRows-1,k+1) * tmp1;
+        x_k *= numext::conj(tau_u);
+        tau_u = numext::conj(tau_u);
+        u_k = u_k.conjugate();
+      }
+
+      if(k>0) A.coeffRef(k-1,k) = tau_u_prev;
+      tau_u_prev = tau_u;
+    }
+    else
+      A.coeffRef(k-1,k) = tau_u_prev;
+
+    A.coeffRef(k,k) = tau_v;
+  }
+  
+  if(bs<bcols)
+    A.coeffRef(bs-1,bs) = tau_u_prev;
+
+  // update A22
+  if(bcols>bs && brows>bs)
+  {
+    SubMatType A11( A.bottomRightCorner(brows-bs,bcols-bs) );
+    SubMatType A10( A.block(bs,0, brows-bs,bs) );
+    SubMatType A01( A.block(0,bs, bs,bcols-bs) );
+    Scalar tmp = A01(bs-1,0);
+    A01(bs-1,0) = 1;
+    A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
+    A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
+    A01(bs-1,0) = tmp;
+  }
+}
+
+/** \internal
+  *
+  * Implementation of a block-bidiagonal reduction.
+  * It is based on the following paper:
+  *   The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and Bidiagonal Form.
+  *   by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995)
+  *   section 3.3
+  */
+template<typename MatrixType, typename BidiagType>
+void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal,
+                                            typename MatrixType::Index maxBlockSize=32,
+                                            typename MatrixType::Scalar* /*tempData*/ = 0)
+{
+  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+
+  Index rows = A.rows();
+  Index cols = A.cols();
+  Index size = (std::min)(rows, cols);
+
+  Matrix<Scalar,MatrixType::RowsAtCompileTime,Dynamic,ColMajor,MatrixType::MaxRowsAtCompileTime> X(rows,maxBlockSize);
+  Matrix<Scalar,MatrixType::ColsAtCompileTime,Dynamic,ColMajor,MatrixType::MaxColsAtCompileTime> Y(cols,maxBlockSize);
+  Index blockSize = (std::min)(maxBlockSize,size);
+
+  Index k = 0;
+  for(k = 0; k < size; k += blockSize)
+  {
+    Index bs = (std::min)(size-k,blockSize);  // actual size of the block
+    Index brows = rows - k;                   // rows of the block
+    Index bcols = cols - k;                   // columns of the block
+
+    // partition the matrix A:
+    // 
+    //      | A00 A01 A02 |
+    //      |             |
+    // A  = | A10 A11 A12 |
+    //      |             |
+    //      | A20 A21 A22 |
+    //
+    // where A11 is a bs x bs diagonal block,
+    // and let:
+    //      | A11 A12 |
+    //  B = |         |
+    //      | A21 A22 |
+
+    BlockType B = A.block(k,k,brows,bcols);
+    
+    // This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
+    // Finally, the algorithm continue on the updated A22.
+    //
+    // However, if B is too small, or A22 empty, then let's use an unblocked strategy
+    if(k+bs==cols || bcols<48) // somewhat arbitrary threshold
+    {
+      upperbidiagonalization_inplace_unblocked(B,
+                                               &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                               &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                               X.data()
+                                              );
+      break; // We're done
+    }
+    else
+    {
+      upperbidiagonalization_blocked_helper<BlockType>( B,
+                                                        &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                                        &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                                        bs,
+                                                        X.topLeftCorner(brows,bs),
+                                                        Y.topLeftCorner(bcols,bs)
+                                                      );
+    }
+  }
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::computeUnblocked(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+
+  ColVectorType temp(rows);
+
+  upperbidiagonalization_inplace_unblocked(m_householder,
+                                           &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
+                                           &(m_bidiagonal.template diagonal<1>().coeffRef(0)),
+                                           temp.data());
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+  upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);
+            
   m_isInitialized = true;
   return *this;
 }

diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h
index efd4428..a0ba764 100644
--- a/Eigen/src/SparseCore/SparseBlock.h
+++ b/Eigen/src/SparseCore/SparseBlock.h

@@ -394,6 +394,8 @@
   protected:
     friend class InnerIterator;
     friend class ReverseInnerIterator;
+    
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
 
     typename XprType::Nested m_matrix;
     const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 6248eee..d46ccc1 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h

@@ -66,7 +66,7 @@
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
-int matrix_log_get_pade_degree(float normTminusI)
+inline int matrix_log_get_pade_degree(float normTminusI)
 {
   const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
             5.3149729967117310e-1 };
@@ -80,7 +80,7 @@
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
-int matrix_log_get_pade_degree(double normTminusI)
+inline int matrix_log_get_pade_degree(double normTminusI)
 {
   const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
             1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
@@ -94,7 +94,7 @@
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
-int matrix_log_get_pade_degree(long double normTminusI)
+inline int matrix_log_get_pade_degree(long double normTminusI)
 {
 #if   LDBL_MANT_DIG == 53         // double precision
   const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,

diff --git a/unsupported/Eigen/src/SVD/BDCSVD.h b/unsupported/Eigen/src/SVD/BDCSVD.h
index d5f0a3f..80006fd 100644
--- a/unsupported/Eigen/src/SVD/BDCSVD.h
+++ b/unsupported/Eigen/src/SVD/BDCSVD.h

@@ -70,6 +70,7 @@
   typedef Matrix<Scalar, Dynamic, Dynamic> MatrixX;
   typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixXr;
   typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+  typedef Array<RealScalar, Dynamic, 1> ArrayXr;
 
   /** \brief Default Constructor.
    *
@@ -78,7 +79,7 @@
    */
   BDCSVD()
     : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
+      algoswap(ALGOSWAP), m_numIters(0)
   {}
 
 
@@ -90,7 +91,7 @@
    */
   BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0)
     : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
+      algoswap(ALGOSWAP), m_numIters(0)
   {
     allocate(rows, cols, computationOptions);
   }
@@ -107,7 +108,7 @@
    */
   BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
     : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
+      algoswap(ALGOSWAP), m_numIters(0)
   {
     compute(matrix, computationOptions);
   }
@@ -197,9 +198,14 @@
  
 private:
   void allocate(Index rows, Index cols, unsigned int computationOptions);
-  void divide (Index firstCol, Index lastCol, Index firstRowW, 
-	       Index firstColW, Index shift);
+  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
   void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
+  void computeSingVals(const ArrayXr& col0, const ArrayXr& diag, VectorType& singVals,
+                       ArrayXr& shifts, ArrayXr& mus);
+  void perturbCol0(const ArrayXr& col0, const ArrayXr& diag, const VectorType& singVals,
+                   const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat);
+  void computeSingVecs(const ArrayXr& zhat, const ArrayXr& diag, const VectorType& singVals,
+                       const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V);
   void deflation43(Index firstCol, Index shift, Index i, Index size);
   void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
   void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
@@ -212,7 +218,9 @@
   Index nRec;
   int algoswap;
   bool isTranspose, compU, compV;
-  
+
+public:  
+  int m_numIters;
 }; //end class BDCSVD
 
 
@@ -484,12 +492,9 @@
 template <typename MatrixType>
 void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
-  using std::abs;
-  Block<MatrixXr> block = m_computed.block(firstCol, firstCol, n, n);
-
   // TODO Get rid of these copies (?)
-  Array<RealScalar, Dynamic, 1> col0 = m_computed.block(firstCol, firstCol, n, 1);
-  Array<RealScalar, Dynamic, 1> diag = m_computed.block(firstCol, firstCol, n, n).diagonal();
+  ArrayXr col0 = m_computed.block(firstCol, firstCol, n, 1);
+  ArrayXr diag = m_computed.block(firstCol, firstCol, n, n).diagonal();
   diag(0) = 0;
 
   // compute singular values and vectors (in decreasing order)
@@ -499,7 +504,25 @@
 
   if (col0.hasNaN() || diag.hasNaN()) return;
 
-  Array<RealScalar, Dynamic, 1> shifts(n), mus(n);
+  ArrayXr shifts(n), mus(n), zhat(n);
+  computeSingVals(col0, diag, singVals, shifts, mus);
+  perturbCol0(col0, diag, singVals, shifts, mus, zhat);
+  computeSingVecs(zhat, diag, singVals, shifts, mus, U, V);
+
+  // Reverse order so that singular values in increased order
+  singVals.reverseInPlace();
+  U.leftCols(n) = U.leftCols(n).rowwise().reverse().eval();
+  if (compV) V = V.rowwise().reverse().eval();
+}
+
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVals(const ArrayXr& col0, const ArrayXr& diag, 
+                                         VectorType& singVals, ArrayXr& shifts, ArrayXr& mus)
+{
+  using std::abs;
+  using std::swap;
+
+  Index n = col0.size();
   for (Index k = 0; k < n; ++k) {
     if (col0(k) == 0) {
       // entry is deflated, so singular value is on diagonal
@@ -509,7 +532,7 @@
       continue;
     } 
 
-    // otherwise, use bisection to find singular value
+    // otherwise, use secular equation to find singular value
     RealScalar left = diag(k);
     RealScalar right = (k != n-1) ? diag(k+1) : (diag(n-1) + col0.matrix().norm());
 
@@ -518,49 +541,98 @@
     RealScalar fMid = 1 + (col0.square() / ((diag + mid) * (diag - mid))).sum();
 
     RealScalar shift;
-    if (k == 0 || fMid > 0) shift = left;
+    if (k == n-1 || fMid > 0) shift = left;
     else shift = right;
-
-    // measure everything relative to shifted
-    Array<RealScalar, Dynamic, 1> diagShifted = diag - shift;
-    RealScalar leftShifted, rightShifted;
+    
+    // measure everything relative to shift
+    ArrayXr diagShifted = diag - shift;
+    
+    // initial guess
+    RealScalar muPrev, muCur;
     if (shift == left) {
-      leftShifted = 1e-30;
-      if (k == 0) rightShifted = right - left;
-      else rightShifted = (right - left) * 0.6; // theoretically we can take 0.5, but let's be safe
+      muPrev = (right - left) * 0.1;
+      if (k == n-1) muCur = right - left;
+      else muCur = (right - left) * 0.5; 
     } else {
-      leftShifted = -(right - left) * 0.6;
-      rightShifted = -1e-30;
+      muPrev = -(right - left) * 0.1;
+      muCur = -(right - left) * 0.5;
     }
 
-    RealScalar fLeft = 1 + (col0.square() / ((diagShifted - leftShifted) * (diag + shift + leftShifted))).sum();
-    RealScalar fRight = 1 + (col0.square() / ((diagShifted - rightShifted) * (diag + shift + rightShifted))).sum();
-    assert(fLeft * fRight < 0);
-        
-    while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * (std::max)(abs(leftShifted), abs(rightShifted))) {
-      RealScalar midShifted = (leftShifted + rightShifted) / 2;
-      RealScalar fMid = 1 + (col0.square() / ((diagShifted - midShifted) * (diag + shift + midShifted))).sum();
-      if (fLeft * fMid < 0) {
-        rightShifted = midShifted;
-        fRight = fMid;
+    RealScalar fPrev = 1 + (col0.square() / ((diagShifted - muPrev) * (diag + shift + muPrev))).sum();
+    RealScalar fCur = 1 + (col0.square() / ((diagShifted - muCur) * (diag + shift + muCur))).sum();
+    if (abs(fPrev) < abs(fCur)) {
+      swap(fPrev, fCur);
+      swap(muPrev, muCur);
+    }
+
+    // rational interpolation: fit a function of the form a / mu + b through the two previous
+    // iterates and use its zero to compute the next iterate
+    bool useBisection = false;
+    while (abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * (std::max)(abs(muCur), abs(muPrev)) && fCur != fPrev && !useBisection) {
+      ++m_numIters;
+
+      RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+      RealScalar b = fCur - a / muCur;
+
+      muPrev = muCur;
+      fPrev = fCur;
+      muCur = -a / b;
+      fCur = 1 + (col0.square() / ((diagShifted - muCur) * (diag + shift + muCur))).sum();
+      
+      if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true;
+      if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true;
+    }
+
+    // fall back on bisection method if rational interpolation did not work
+    if (useBisection) {
+      RealScalar leftShifted, rightShifted;
+      if (shift == left) {
+        leftShifted = 1e-30;
+        if (k == 0) rightShifted = right - left;
+        else rightShifted = (right - left) * 0.6; // theoretically we can take 0.5, but let's be safe
       } else {
-        leftShifted = midShifted;
-        fLeft = fMid;
+        leftShifted = -(right - left) * 0.6;
+        rightShifted = -1e-30;
       }
+      
+      RealScalar fLeft = 1 + (col0.square() / ((diagShifted - leftShifted) * (diag + shift + leftShifted))).sum();
+      RealScalar fRight = 1 + (col0.square() / ((diagShifted - rightShifted) * (diag + shift + rightShifted))).sum();
+      assert(fLeft * fRight < 0);
+      
+      while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * (std::max)(abs(leftShifted), abs(rightShifted))) {
+        RealScalar midShifted = (leftShifted + rightShifted) / 2;
+        RealScalar fMid = 1 + (col0.square() / ((diagShifted - midShifted) * (diag + shift + midShifted))).sum();
+        if (fLeft * fMid < 0) {
+          rightShifted = midShifted;
+          fRight = fMid;
+        } else {
+          leftShifted = midShifted;
+          fLeft = fMid;
+        }
+      }
+
+      muCur = (leftShifted + rightShifted) / 2;
     }
       
-    singVals[k] = shift + (leftShifted + rightShifted) / 2;
+    singVals[k] = shift + muCur;
     shifts[k] = shift;
-    mus[k] = (leftShifted + rightShifted) / 2;
+    mus[k] = muCur;
 
     // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
     // (deflation is supposed to avoid this from happening)
     if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
     if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
   }
+}
 
-  // zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
-  Array<RealScalar, Dynamic, 1> zhat(n);
+
+// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
+template <typename MatrixType>
+void BDCSVD<MatrixType>::perturbCol0
+   (const ArrayXr& col0, const ArrayXr& diag, const VectorType& singVals,
+    const ArrayXr& shifts, const ArrayXr& mus, ArrayXr& zhat)
+{
+  Index n = col0.size();
   for (Index k = 0; k < n; ++k) {
     if (col0(k) == 0) 
       zhat(k) = 0;
@@ -583,12 +655,15 @@
       else zhat(k) = -tmp;
     }
   }
+}
 
-  MatrixXr Mhat = MatrixXr::Zero(n,n);
-  Mhat.diagonal() = diag;
-  Mhat.col(0) = zhat;
-
-  // compute singular vectors
+// compute singular vectors
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVecs
+   (const ArrayXr& zhat, const ArrayXr& diag, const VectorType& singVals,
+    const ArrayXr& shifts, const ArrayXr& mus, MatrixXr& U, MatrixXr& V)
+{
+  Index n = zhat.size();
   for (Index k = 0; k < n; ++k) {
     if (zhat(k) == 0) {
       U.col(k) = VectorType::Unit(n+1, k);
@@ -606,11 +681,6 @@
     }
   }
   U.col(n) = VectorType::Unit(n+1, n);
-
-  // Reverse order so that singular values in increased order
-  singVals.reverseInPlace();
-  U.leftCols(n) = U.leftCols(n).rowwise().reverse().eval();
-  if (compV) V = V.rowwise().reverse().eval();
 }
 
 
@@ -692,8 +762,11 @@
 template <typename MatrixType>
 void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift){
   //condition 4.1
-  RealScalar EPS = NumTraits<RealScalar>::epsilon() * 10 * (std::max<RealScalar>(m_computed(firstCol + shift + 1, firstCol + shift + 1), m_computed(firstCol + k, firstCol + k)));
+  using std::sqrt;
   const Index length = lastCol + 1 - firstCol;
+  RealScalar norm1 = m_computed.block(firstCol+shift, firstCol+shift, length, 1).squaredNorm();
+  RealScalar norm2 = m_computed.block(firstCol+shift, firstCol+shift, length, length).diagonal().squaredNorm();
+  RealScalar EPS = 10 * NumTraits<RealScalar>::epsilon() * sqrt(norm1 + norm2);
   if (m_computed(firstCol + shift, firstCol + shift) < EPS){
     m_computed(firstCol + shift, firstCol + shift) = EPS;
   }