Eigen/src/SVD/UpperBidiagonalization.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_BIDIAGONALIZATION_H
 #define EIGEN_BIDIAGONALIZATION_H

 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 namespace Eigen {

 namespace internal {
 // UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API.
 // At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class.

 template <typename MatrixType_>
 class UpperBidiagonalization {
  public:
   typedef MatrixType_ MatrixType;
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     ColsAtCompileTimeMinusOne = internal::decrement_size<ColsAtCompileTime>::ret
   };
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
   typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
   typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
   typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
   typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
   typedef HouseholderSequence<
       const MatrixType, const internal::remove_all_t<typename Diagonal<const MatrixType, 0>::ConjugateReturnType> >
       HouseholderUSequenceType;
   typedef HouseholderSequence<const internal::remove_all_t<typename MatrixType::ConjugateReturnType>,
                               Diagonal<const MatrixType, 1>, OnTheRight>
       HouseholderVSequenceType;

   /**
    * \brief Default Constructor.
    *
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via Bidiagonalization::compute(const MatrixType&).
    */
   UpperBidiagonalization() : m_householder(), m_bidiagonal(0, 0), m_isInitialized(false) {}

   explicit UpperBidiagonalization(const MatrixType& matrix)
       : m_householder(matrix.rows(), matrix.cols()),
         m_bidiagonal(matrix.cols(), matrix.cols()),
         m_isInitialized(false) {
     compute(matrix);
   }

   UpperBidiagonalization(Index rows, Index cols)
       : m_householder(rows, cols), m_bidiagonal(cols, cols), m_isInitialized(false) {}

   UpperBidiagonalization& compute(const MatrixType& matrix);
   UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);

   const MatrixType& householder() const { return m_householder; }
   const BidiagonalType& bidiagonal() const { return m_bidiagonal; }

   const HouseholderUSequenceType householderU() const {
     eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
     return HouseholderUSequenceType(m_householder, m_householder.diagonal().conjugate());
   }

   const HouseholderVSequenceType householderV()  // const here gives nasty errors and i'm lazy
   {
     eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
     return HouseholderVSequenceType(m_householder.conjugate(), m_householder.const_derived().template diagonal<1>())
         .setLength(m_householder.cols() - 1)
         .setShift(1);
   }

  protected:
   MatrixType m_householder;
   BidiagonalType m_bidiagonal;
   bool m_isInitialized;
 };

 // Standard upper bidiagonalization without fancy optimizations
 // This version should be faster for small matrix size
 template <typename MatrixType>
 void upperbidiagonalization_inplace_unblocked(MatrixType& mat, typename MatrixType::RealScalar* diagonal,
                                               typename MatrixType::RealScalar* upper_diagonal,
                                               typename MatrixType::Scalar* tempData = 0) {
   typedef typename MatrixType::Scalar Scalar;

   Index rows = mat.rows();
   Index cols = mat.cols();

   typedef Matrix<Scalar, Dynamic, 1, ColMajor, MatrixType::MaxRowsAtCompileTime, 1> TempType;
   TempType tempVector;
   if (tempData == 0) {
     tempVector.resize(rows);
     tempData = tempVector.data();
   }

   for (Index k = 0; /* breaks at k==cols-1 below */; ++k) {
     Index remainingRows = rows - k;
     Index remainingCols = cols - k - 1;

     // construct left householder transform in-place in A
     mat.col(k).tail(remainingRows).makeHouseholderInPlace(mat.coeffRef(k, k), diagonal[k]);
     // apply householder transform to remaining part of A on the left
     mat.bottomRightCorner(remainingRows, remainingCols)
         .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows - 1), mat.coeff(k, k), tempData);

     if (k == cols - 1) break;

     // construct right householder transform in-place in mat
     mat.row(k).tail(remainingCols).makeHouseholderInPlace(mat.coeffRef(k, k + 1), upper_diagonal[k]);
     // apply householder transform to remaining part of mat on the left
     mat.bottomRightCorner(remainingRows - 1, remainingCols)
         .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols - 1).adjoint(), mat.coeff(k, k + 1), tempData);
   }
 }

 /** \internal
  * Helper routine for the block reduction to upper bidiagonal form.
  *
  * Let's partition the matrix A:
  *
  *      | A00 A01 |
  *  A = |         |
  *      | A10 A11 |
  *
  * This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
  * and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
  * is updated using matrix-matrix products:
  *   A22 -= V * Y^T - X * U^T
  * where V and U contains the left and right Householder vectors. U and V are stored in A10, and A01
  * respectively, and the update matrices X and Y are computed during the reduction.
  *
  */
 template <typename MatrixType>
 void upperbidiagonalization_blocked_helper(
     MatrixType& A, typename MatrixType::RealScalar* diagonal, typename MatrixType::RealScalar* upper_diagonal, Index bs,
     Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > X,
     Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > Y) {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef typename NumTraits<RealScalar>::Literal Literal;
   static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
   typedef InnerStride<StorageOrder == ColMajor ? 1 : Dynamic> ColInnerStride;
   typedef InnerStride<StorageOrder == ColMajor ? Dynamic : 1> RowInnerStride;
   typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride> SubColumnType;
   typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride> SubRowType;
   typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > SubMatType;

   Index brows = A.rows();
   Index bcols = A.cols();

   Scalar tau_u, tau_u_prev(0), tau_v;

   for (Index k = 0; k < bs; ++k) {
     Index remainingRows = brows - k;
     Index remainingCols = bcols - k - 1;

     SubMatType X_k1(X.block(k, 0, remainingRows, k));
     SubMatType V_k1(A.block(k, 0, remainingRows, k));

     // 1 - update the k-th column of A
     SubColumnType v_k = A.col(k).tail(remainingRows);
     v_k -= V_k1 * Y.row(k).head(k).adjoint();
     if (k) v_k -= X_k1 * A.col(k).head(k);

     // 2 - construct left Householder transform in-place
     v_k.makeHouseholderInPlace(tau_v, diagonal[k]);

     if (k + 1 < bcols) {
       SubMatType Y_k(Y.block(k + 1, 0, remainingCols, k + 1));
       SubMatType U_k1(A.block(0, k + 1, k, remainingCols));

       // this eases the application of Householder transforAions
       // A(k,k) will store tau_v later
       A(k, k) = Scalar(1);

       // 3 - Compute y_k^T = tau_v * ( A^T*v_k - Y_k-1*V_k-1^T*v_k - U_k-1*X_k-1^T*v_k )
       {
         SubColumnType y_k(Y.col(k).tail(remainingCols));

         // let's use the beginning of column k of Y as a temporary vector
         SubColumnType tmp(Y.col(k).head(k));
         y_k.noalias() = A.block(k, k + 1, remainingRows, remainingCols).adjoint() * v_k;  // bottleneck
         tmp.noalias() = V_k1.adjoint() * v_k;
         y_k.noalias() -= Y_k.leftCols(k) * tmp;
         tmp.noalias() = X_k1.adjoint() * v_k;
         y_k.noalias() -= U_k1.adjoint() * tmp;
         y_k *= numext::conj(tau_v);
       }

       // 4 - update k-th row of A (it will become u_k)
       SubRowType u_k(A.row(k).tail(remainingCols));
       u_k = u_k.conjugate();
       {
         u_k -= Y_k * A.row(k).head(k + 1).adjoint();
         if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
       }

       // 5 - construct right Householder transform in-place
       u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);

       // this eases the application of Householder transformations
       // A(k,k+1) will store tau_u later
       A(k, k + 1) = Scalar(1);

       // 6 - Compute x_k = tau_u * ( A*u_k - X_k-1*U_k-1^T*u_k - V_k*Y_k^T*u_k )
       {
         SubColumnType x_k(X.col(k).tail(remainingRows - 1));

         // let's use the beginning of column k of X as a temporary vectors
         // note that tmp0 and tmp1 overlaps
         SubColumnType tmp0(X.col(k).head(k)), tmp1(X.col(k).head(k + 1));

         x_k.noalias() = A.block(k + 1, k + 1, remainingRows - 1, remainingCols) * u_k.transpose();  // bottleneck
         tmp0.noalias() = U_k1 * u_k.transpose();
         x_k.noalias() -= X_k1.bottomRows(remainingRows - 1) * tmp0;
         tmp1.noalias() = Y_k.adjoint() * u_k.transpose();
         x_k.noalias() -= A.block(k + 1, 0, remainingRows - 1, k + 1) * tmp1;
         x_k *= numext::conj(tau_u);
         tau_u = numext::conj(tau_u);
         u_k = u_k.conjugate();
       }

       if (k > 0) A.coeffRef(k - 1, k) = tau_u_prev;
       tau_u_prev = tau_u;
     } else
       A.coeffRef(k - 1, k) = tau_u_prev;

     A.coeffRef(k, k) = tau_v;
   }

   if (bs < bcols) A.coeffRef(bs - 1, bs) = tau_u_prev;

   // update A22
   if (bcols > bs && brows > bs) {
     SubMatType A11(A.bottomRightCorner(brows - bs, bcols - bs));
     SubMatType A10(A.block(bs, 0, brows - bs, bs));
     SubMatType A01(A.block(0, bs, bs, bcols - bs));
     Scalar tmp = A01(bs - 1, 0);
     A01(bs - 1, 0) = Literal(1);
     A11.noalias() -= A10 * Y.topLeftCorner(bcols, bs).bottomRows(bcols - bs).adjoint();
     A11.noalias() -= X.topLeftCorner(brows, bs).bottomRows(brows - bs) * A01;
     A01(bs - 1, 0) = tmp;
   }
 }

 /** \internal
  *
  * Implementation of a block-bidiagonal reduction.
  * It is based on the following paper:
  *   The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and
  * Bidiagonal Form. by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995) section 3.3
  */
 template <typename MatrixType, typename BidiagType>
 void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal, Index maxBlockSize = 32,
                                             typename MatrixType::Scalar* /*tempData*/ = 0) {
   typedef typename MatrixType::Scalar Scalar;
   typedef Block<MatrixType, Dynamic, Dynamic> BlockType;

   Index rows = A.rows();
   Index cols = A.cols();
   Index size = (std::min)(rows, cols);

   // X and Y are work space
   static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
   Matrix<Scalar, MatrixType::RowsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxRowsAtCompileTime> X(
       rows, maxBlockSize);
   Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxColsAtCompileTime> Y(
       cols, maxBlockSize);
   Index blockSize = (std::min)(maxBlockSize, size);

   Index k = 0;
   for (k = 0; k < size; k += blockSize) {
     Index bs = (std::min)(size - k, blockSize);  // actual size of the block
     Index brows = rows - k;                      // rows of the block
     Index bcols = cols - k;                      // columns of the block

     // partition the matrix A:
     //
     //      | A00 A01 A02 |
     //      |             |
     // A  = | A10 A11 A12 |
     //      |             |
     //      | A20 A21 A22 |
     //
     // where A11 is a bs x bs diagonal block,
     // and let:
     //      | A11 A12 |
     //  B = |         |
     //      | A21 A22 |

     BlockType B = A.block(k, k, brows, bcols);

     // This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
     // Finally, the algorithm continue on the updated A22.
     //
     // However, if B is too small, or A22 empty, then let's use an unblocked strategy

     auto upper_diagonal = bidiagonal.template diagonal<1>();
     typename MatrixType::RealScalar* upper_diagonal_ptr =
         upper_diagonal.size() > 0 ? &upper_diagonal.coeffRef(k) : nullptr;

     if (k + bs == cols || bcols < 48)  // somewhat arbitrary threshold
     {
       upperbidiagonalization_inplace_unblocked(B, &(bidiagonal.template diagonal<0>().coeffRef(k)), upper_diagonal_ptr,
                                                X.data());
       break;  // We're done
     } else {
       upperbidiagonalization_blocked_helper<BlockType>(B, &(bidiagonal.template diagonal<0>().coeffRef(k)),
                                                        upper_diagonal_ptr, bs, X.topLeftCorner(brows, bs),
                                                        Y.topLeftCorner(bcols, bs));
     }
   }
 }

 template <typename MatrixType_>
 UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::computeUnblocked(const MatrixType_& matrix) {
   Index rows = matrix.rows();
   Index cols = matrix.cols();
   EIGEN_ONLY_USED_FOR_DEBUG(cols);

   eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");

   m_householder = matrix;

   ColVectorType temp(rows);

   upperbidiagonalization_inplace_unblocked(m_householder, &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
                                            &(m_bidiagonal.template diagonal<1>().coeffRef(0)), temp.data());

   m_isInitialized = true;
   return *this;
 }

 template <typename MatrixType_>
 UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::compute(const MatrixType_& matrix) {
   Index rows = matrix.rows();
   Index cols = matrix.cols();
   EIGEN_ONLY_USED_FOR_DEBUG(rows);
   EIGEN_ONLY_USED_FOR_DEBUG(cols);

   eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");

   m_householder = matrix;
   upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);

   m_isInitialized = true;
   return *this;
 }

 #if 0
 /** \return the Householder QR decomposition of \c *this.
   *
   * \sa class Bidiagonalization
   */
 template<typename Derived>
 const UpperBidiagonalization<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::bidiagonalization() const
 {
   return UpperBidiagonalization<PlainObject>(eval());
 }
 #endif

 }  // end namespace internal

 }  // end namespace Eigen

 #endif  // EIGEN_BIDIAGONALIZATION_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
	// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_BIDIAGONALIZATION_H
	#define EIGEN_BIDIAGONALIZATION_H

	// IWYU pragma: private
	#include "./InternalHeaderCheck.h"

	namespace Eigen {

	namespace internal {
	// UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API.
	// At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class.

	template <typename MatrixType_>
	class UpperBidiagonalization {
	public:
	typedef MatrixType_ MatrixType;
	enum {
	RowsAtCompileTime = MatrixType::RowsAtCompileTime,
	ColsAtCompileTime = MatrixType::ColsAtCompileTime,
	ColsAtCompileTimeMinusOne = internal::decrement_size<ColsAtCompileTime>::ret
	};
	typedef typename MatrixType::Scalar Scalar;
	typedef typename MatrixType::RealScalar RealScalar;
	typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
	typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
	typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
	typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
	typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
	typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
	typedef HouseholderSequence<
	const MatrixType, const internal::remove_all_t<typename Diagonal<const MatrixType, 0>::ConjugateReturnType> >
	HouseholderUSequenceType;
	typedef HouseholderSequence<const internal::remove_all_t<typename MatrixType::ConjugateReturnType>,
	Diagonal<const MatrixType, 1>, OnTheRight>
	HouseholderVSequenceType;

	/**
	* \brief Default Constructor.
	*
	* The default constructor is useful in cases in which the user intends to
	* perform decompositions via Bidiagonalization::compute(const MatrixType&).
	*/
	UpperBidiagonalization() : m_householder(), m_bidiagonal(0, 0), m_isInitialized(false) {}

	explicit UpperBidiagonalization(const MatrixType& matrix)
	: m_householder(matrix.rows(), matrix.cols()),
	m_bidiagonal(matrix.cols(), matrix.cols()),
	m_isInitialized(false) {
	compute(matrix);
	}

	UpperBidiagonalization(Index rows, Index cols)
	: m_householder(rows, cols), m_bidiagonal(cols, cols), m_isInitialized(false) {}

	UpperBidiagonalization& compute(const MatrixType& matrix);
	UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);

	const MatrixType& householder() const { return m_householder; }
	const BidiagonalType& bidiagonal() const { return m_bidiagonal; }

	const HouseholderUSequenceType householderU() const {
	eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
	return HouseholderUSequenceType(m_householder, m_householder.diagonal().conjugate());
	}

	const HouseholderVSequenceType householderV() // const here gives nasty errors and i'm lazy
	{
	eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
	return HouseholderVSequenceType(m_householder.conjugate(), m_householder.const_derived().template diagonal<1>())
	.setLength(m_householder.cols() - 1)
	.setShift(1);
	}

	protected:
	MatrixType m_householder;
	BidiagonalType m_bidiagonal;
	bool m_isInitialized;
	};

	// Standard upper bidiagonalization without fancy optimizations
	// This version should be faster for small matrix size
	template <typename MatrixType>
	void upperbidiagonalization_inplace_unblocked(MatrixType& mat, typename MatrixType::RealScalar* diagonal,
	typename MatrixType::RealScalar* upper_diagonal,
	typename MatrixType::Scalar* tempData = 0) {
	typedef typename MatrixType::Scalar Scalar;

	Index rows = mat.rows();
	Index cols = mat.cols();

	typedef Matrix<Scalar, Dynamic, 1, ColMajor, MatrixType::MaxRowsAtCompileTime, 1> TempType;
	TempType tempVector;
	if (tempData == 0) {
	tempVector.resize(rows);
	tempData = tempVector.data();
	}

	for (Index k = 0; /* breaks at k==cols-1 below */; ++k) {
	Index remainingRows = rows - k;
	Index remainingCols = cols - k - 1;

	// construct left householder transform in-place in A
	mat.col(k).tail(remainingRows).makeHouseholderInPlace(mat.coeffRef(k, k), diagonal[k]);
	// apply householder transform to remaining part of A on the left
	mat.bottomRightCorner(remainingRows, remainingCols)
	.applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows - 1), mat.coeff(k, k), tempData);

	if (k == cols - 1) break;

	// construct right householder transform in-place in mat
	mat.row(k).tail(remainingCols).makeHouseholderInPlace(mat.coeffRef(k, k + 1), upper_diagonal[k]);
	// apply householder transform to remaining part of mat on the left
	mat.bottomRightCorner(remainingRows - 1, remainingCols)
	.applyHouseholderOnTheRight(mat.row(k).tail(remainingCols - 1).adjoint(), mat.coeff(k, k + 1), tempData);
	}
	}

	/** \internal
	* Helper routine for the block reduction to upper bidiagonal form.
	*
	* Let's partition the matrix A:
	*
	* \| A00 A01 \|
	* A = \| \|
	* \| A10 A11 \|
	*
	* This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
	* and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
	* is updated using matrix-matrix products:
	* A22 -= V * Y^T - X * U^T
	* where V and U contains the left and right Householder vectors. U and V are stored in A10, and A01
	* respectively, and the update matrices X and Y are computed during the reduction.
	*
	*/
	template <typename MatrixType>
	void upperbidiagonalization_blocked_helper(
	MatrixType& A, typename MatrixType::RealScalar* diagonal, typename MatrixType::RealScalar* upper_diagonal, Index bs,
	Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > X,
	Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > Y) {
	typedef typename MatrixType::Scalar Scalar;
	typedef typename MatrixType::RealScalar RealScalar;
	typedef typename NumTraits<RealScalar>::Literal Literal;
	static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
	typedef InnerStride<StorageOrder == ColMajor ? 1 : Dynamic> ColInnerStride;
	typedef InnerStride<StorageOrder == ColMajor ? Dynamic : 1> RowInnerStride;
	typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride> SubColumnType;
	typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride> SubRowType;
	typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > SubMatType;

	Index brows = A.rows();
	Index bcols = A.cols();

	Scalar tau_u, tau_u_prev(0), tau_v;

	for (Index k = 0; k < bs; ++k) {
	Index remainingRows = brows - k;
	Index remainingCols = bcols - k - 1;

	SubMatType X_k1(X.block(k, 0, remainingRows, k));
	SubMatType V_k1(A.block(k, 0, remainingRows, k));

	// 1 - update the k-th column of A
	SubColumnType v_k = A.col(k).tail(remainingRows);
	v_k -= V_k1 * Y.row(k).head(k).adjoint();
	if (k) v_k -= X_k1 * A.col(k).head(k);

	// 2 - construct left Householder transform in-place
	v_k.makeHouseholderInPlace(tau_v, diagonal[k]);

	if (k + 1 < bcols) {
	SubMatType Y_k(Y.block(k + 1, 0, remainingCols, k + 1));
	SubMatType U_k1(A.block(0, k + 1, k, remainingCols));

	// this eases the application of Householder transforAions
	// A(k,k) will store tau_v later
	A(k, k) = Scalar(1);

	// 3 - Compute y_k^T = tau_v * ( A^Tv_k - Y_k-1V_k-1^Tv_k - U_k-1X_k-1^T*v_k )
	{
	SubColumnType y_k(Y.col(k).tail(remainingCols));

	// let's use the beginning of column k of Y as a temporary vector
	SubColumnType tmp(Y.col(k).head(k));
	y_k.noalias() = A.block(k, k + 1, remainingRows, remainingCols).adjoint() * v_k; // bottleneck
	tmp.noalias() = V_k1.adjoint() * v_k;
	y_k.noalias() -= Y_k.leftCols(k) * tmp;
	tmp.noalias() = X_k1.adjoint() * v_k;
	y_k.noalias() -= U_k1.adjoint() * tmp;
	y_k *= numext::conj(tau_v);
	}

	// 4 - update k-th row of A (it will become u_k)
	SubRowType u_k(A.row(k).tail(remainingCols));
	u_k = u_k.conjugate();
	{
	u_k -= Y_k * A.row(k).head(k + 1).adjoint();
	if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
	}

	// 5 - construct right Householder transform in-place
	u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);

	// this eases the application of Householder transformations
	// A(k,k+1) will store tau_u later
	A(k, k + 1) = Scalar(1);

	// 6 - Compute x_k = tau_u * ( Au_k - X_k-1U_k-1^Tu_k - V_kY_k^T*u_k )
	{
	SubColumnType x_k(X.col(k).tail(remainingRows - 1));

	// let's use the beginning of column k of X as a temporary vectors
	// note that tmp0 and tmp1 overlaps
	SubColumnType tmp0(X.col(k).head(k)), tmp1(X.col(k).head(k + 1));

	x_k.noalias() = A.block(k + 1, k + 1, remainingRows - 1, remainingCols) * u_k.transpose(); // bottleneck
	tmp0.noalias() = U_k1 * u_k.transpose();
	x_k.noalias() -= X_k1.bottomRows(remainingRows - 1) * tmp0;
	tmp1.noalias() = Y_k.adjoint() * u_k.transpose();
	x_k.noalias() -= A.block(k + 1, 0, remainingRows - 1, k + 1) * tmp1;
	x_k *= numext::conj(tau_u);
	tau_u = numext::conj(tau_u);
	u_k = u_k.conjugate();
	}

	if (k > 0) A.coeffRef(k - 1, k) = tau_u_prev;
	tau_u_prev = tau_u;
	} else
	A.coeffRef(k - 1, k) = tau_u_prev;

	A.coeffRef(k, k) = tau_v;
	}

	if (bs < bcols) A.coeffRef(bs - 1, bs) = tau_u_prev;

	// update A22
	if (bcols > bs && brows > bs) {
	SubMatType A11(A.bottomRightCorner(brows - bs, bcols - bs));
	SubMatType A10(A.block(bs, 0, brows - bs, bs));
	SubMatType A01(A.block(0, bs, bs, bcols - bs));
	Scalar tmp = A01(bs - 1, 0);
	A01(bs - 1, 0) = Literal(1);
	A11.noalias() -= A10 * Y.topLeftCorner(bcols, bs).bottomRows(bcols - bs).adjoint();
	A11.noalias() -= X.topLeftCorner(brows, bs).bottomRows(brows - bs) * A01;
	A01(bs - 1, 0) = tmp;
	}
	}

	/** \internal
	*
	* Implementation of a block-bidiagonal reduction.
	* It is based on the following paper:
	* The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and
	* Bidiagonal Form. by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995) section 3.3
	*/
	template <typename MatrixType, typename BidiagType>
	void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal, Index maxBlockSize = 32,
	typename MatrixType::Scalar* /tempData/ = 0) {
	typedef typename MatrixType::Scalar Scalar;
	typedef Block<MatrixType, Dynamic, Dynamic> BlockType;

	Index rows = A.rows();
	Index cols = A.cols();
	Index size = (std::min)(rows, cols);

	// X and Y are work space
	static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
	Matrix<Scalar, MatrixType::RowsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxRowsAtCompileTime> X(
	rows, maxBlockSize);
	Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxColsAtCompileTime> Y(
	cols, maxBlockSize);
	Index blockSize = (std::min)(maxBlockSize, size);

	Index k = 0;
	for (k = 0; k < size; k += blockSize) {
	Index bs = (std::min)(size - k, blockSize); // actual size of the block
	Index brows = rows - k; // rows of the block
	Index bcols = cols - k; // columns of the block

	// partition the matrix A:
	//
	// \| A00 A01 A02 \|
	// \| \|
	// A = \| A10 A11 A12 \|
	// \| \|
	// \| A20 A21 A22 \|
	//
	// where A11 is a bs x bs diagonal block,
	// and let:
	// \| A11 A12 \|
	// B = \| \|
	// \| A21 A22 \|

	BlockType B = A.block(k, k, brows, bcols);

	// This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
	// Finally, the algorithm continue on the updated A22.
	//
	// However, if B is too small, or A22 empty, then let's use an unblocked strategy

	auto upper_diagonal = bidiagonal.template diagonal<1>();
	typename MatrixType::RealScalar* upper_diagonal_ptr =
	upper_diagonal.size() > 0 ? &upper_diagonal.coeffRef(k) : nullptr;

	if (k + bs == cols \|\| bcols < 48) // somewhat arbitrary threshold
	{
	upperbidiagonalization_inplace_unblocked(B, &(bidiagonal.template diagonal<0>().coeffRef(k)), upper_diagonal_ptr,
	X.data());
	break; // We're done
	} else {
	upperbidiagonalization_blocked_helper<BlockType>(B, &(bidiagonal.template diagonal<0>().coeffRef(k)),
	upper_diagonal_ptr, bs, X.topLeftCorner(brows, bs),
	Y.topLeftCorner(bcols, bs));
	}
	}
	}

	template <typename MatrixType_>
	UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::computeUnblocked(const MatrixType_& matrix) {
	Index rows = matrix.rows();
	Index cols = matrix.cols();
	EIGEN_ONLY_USED_FOR_DEBUG(cols);

	eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");

	m_householder = matrix;

	ColVectorType temp(rows);

	upperbidiagonalization_inplace_unblocked(m_householder, &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
	&(m_bidiagonal.template diagonal<1>().coeffRef(0)), temp.data());

	m_isInitialized = true;
	return *this;
	}

	template <typename MatrixType_>
	UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::compute(const MatrixType_& matrix) {
	Index rows = matrix.rows();
	Index cols = matrix.cols();
	EIGEN_ONLY_USED_FOR_DEBUG(rows);
	EIGEN_ONLY_USED_FOR_DEBUG(cols);

	eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");

	m_householder = matrix;
	upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);

	m_isInitialized = true;
	return *this;
	}

	#if 0
	/** \return the Householder QR decomposition of \c *this.
	*
	* \sa class Bidiagonalization
	*/
	template<typename Derived>
	const UpperBidiagonalization<typename MatrixBase<Derived>::PlainObject>
	MatrixBase<Derived>::bidiagonalization() const
	{
	return UpperBidiagonalization<PlainObject>(eval());
	}
	#endif

	} // end namespace internal

	} // end namespace Eigen

	#endif // EIGEN_BIDIAGONALIZATION_H