Eigen/src/Core/SolveTriangular.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra. Eigen itself is part of the KDE project.
 //
 // Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
 // License as published by the Free Software Foundation; either
 // version 3 of the License, or (at your option) any later version.
 //
 // Alternatively, you can redistribute it and/or
 // modify it under the terms of the GNU General Public License as
 // published by the Free Software Foundation; either version 2 of
 // the License, or (at your option) any later version.
 //
 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
 // GNU General Public License for more details.
 //
 // You should have received a copy of the GNU Lesser General Public
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.

 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H

 template<typename XprType> struct ei_is_part { enum {value=false}; };
 template<typename XprType, unsigned int Mode> struct ei_is_part<Part<XprType,Mode> > { enum {value=true}; };

 template<typename Lhs, typename Rhs,
   int TriangularPart = (int(Lhs::Flags) & LowerTriangularBit)
                      ? LowerTriangular
                      : (int(Lhs::Flags) & UpperTriangularBit)
                      ? UpperTriangular
                      : -1,
   int StorageOrder = ei_is_part<Lhs>::value ? -1  // this is to solve ambiguous specializations
                    : int(Lhs::Flags) & (RowMajorBit|SparseBit)
   >
 struct ei_solve_triangular_selector;

 // transform a Part xpr to a Flagged xpr
 template<typename Lhs, unsigned int LhsMode, typename Rhs, int UpLo, int StorageOrder>
 struct ei_solve_triangular_selector<Part<Lhs,LhsMode>,Rhs,UpLo,StorageOrder>
 {
   static void run(const Part<Lhs,LhsMode>& lhs, Rhs& other)
   {
     ei_solve_triangular_selector<Flagged<Lhs,LhsMode,0>,Rhs>::run(lhs._expression(), other);
   }
 };

 // forward substitution, row-major
 template<typename Lhs, typename Rhs, int UpLo>
 struct ei_solve_triangular_selector<Lhs,Rhs,UpLo,RowMajor|IsDense>
 {
   typedef typename Rhs::Scalar Scalar;
   static void run(const Lhs& lhs, Rhs& other)
   {
     const bool IsLowerTriangular = (UpLo==LowerTriangular);
     const int size = lhs.cols();
     /* We perform the inverse product per block of 4 rows such that we perfectly match
      * our optimized matrix * vector product. blockyStart represents the number of rows
      * we have process first using the non-block version.
      */
     int blockyStart = (std::max(size-5,0)/4)*4;
     if (IsLowerTriangular)
       blockyStart = size - blockyStart;
     else
       blockyStart -= 1;
     for(int c=0 ; c<other.cols() ; ++c)
     {
       // process first rows using the non block version
       if(!(Lhs::Flags & UnitDiagBit))
       {
         if (IsLowerTriangular)
           other.coeffRef(0,c) = other.coeff(0,c)/lhs.coeff(0, 0);
         else
           other.coeffRef(size-1,c) = other.coeff(size-1, c)/lhs.coeff(size-1, size-1);
       }
       for(int i=(IsLowerTriangular ? 1 : size-2); IsLowerTriangular ? i<blockyStart : i>blockyStart; i += (IsLowerTriangular ? 1 : -1) )
       {
         Scalar tmp = other.coeff(i,c)
           - (IsLowerTriangular ? ((lhs.row(i).start(i)) * other.col(c).start(i)).coeff(0,0)
                      : ((lhs.row(i).end(size-i-1)) * other.col(c).end(size-i-1)).coeff(0,0));
         if (Lhs::Flags & UnitDiagBit)
           other.coeffRef(i,c) = tmp;
         else
           other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
       }

       // now let's process the remaining rows 4 at once
       for(int i=blockyStart; IsLowerTriangular ? i<size : i>0; )
       {
         int startBlock = i;
         int endBlock = startBlock + (IsLowerTriangular ? 4 : -4);

         /* Process the i cols times 4 rows block, and keep the result in a temporary vector */
         // FIXME use fixed size block but take care to small fixed size matrices...
         Matrix<Scalar,Dynamic,1> btmp(4);
         if (IsLowerTriangular)
           btmp = lhs.block(startBlock,0,4,i) * other.col(c).start(i);
         else
           btmp = lhs.block(i-3,i+1,4,size-1-i) * other.col(c).end(size-1-i);

         /* Let's process the 4x4 sub-matrix as usual.
          * btmp stores the diagonal coefficients used to update the remaining part of the result.
          */
         {
           Scalar tmp = other.coeff(startBlock,c)-btmp.coeff(IsLowerTriangular?0:3);
           if (Lhs::Flags & UnitDiagBit)
             other.coeffRef(i,c) = tmp;
           else
             other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
         }

         i += IsLowerTriangular ? 1 : -1;
         for (;IsLowerTriangular ? i<endBlock : i>endBlock; i += IsLowerTriangular ? 1 : -1)
         {
           int remainingSize = IsLowerTriangular ? i-startBlock : startBlock-i;
           Scalar tmp = other.coeff(i,c)
             - btmp.coeff(IsLowerTriangular ? remainingSize : 3-remainingSize)
             - (   lhs.row(i).segment(IsLowerTriangular ? startBlock : i+1, remainingSize)
               * other.col(c).segment(IsLowerTriangular ? startBlock : i+1, remainingSize)).coeff(0,0);

           if (Lhs::Flags & UnitDiagBit)
             other.coeffRef(i,c) = tmp;
           else
             other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
         }
       }
     }
   }
 };

 // Implements the following configurations:
 //  - inv(LowerTriangular,         ColMajor) * Column vector
 //  - inv(LowerTriangular,UnitDiag,ColMajor) * Column vector
 //  - inv(UpperTriangular,         ColMajor) * Column vector
 //  - inv(UpperTriangular,UnitDiag,ColMajor) * Column vector
 template<typename Lhs, typename Rhs, int UpLo>
 struct ei_solve_triangular_selector<Lhs,Rhs,UpLo,ColMajor|IsDense>
 {
   typedef typename Rhs::Scalar Scalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
   enum { PacketSize =  ei_packet_traits<Scalar>::size };

   static void run(const Lhs& lhs, Rhs& other)
   {
     static const bool IsLowerTriangular = (UpLo==LowerTriangular);
     const int size = lhs.cols();
     for(int c=0 ; c<other.cols() ; ++c)
     {
       /* let's perform the inverse product per block of 4 columns such that we perfectly match
        * our optimized matrix * vector product. blockyEnd represents the number of rows
        * we can process using the block version.
        */
       int blockyEnd = (std::max(size-5,0)/4)*4;
       if (!IsLowerTriangular)
         blockyEnd = size-1 - blockyEnd;
       for(int i=IsLowerTriangular ? 0 : size-1; IsLowerTriangular ? i<blockyEnd : i>blockyEnd;)
       {
         /* Let's process the 4x4 sub-matrix as usual.
          * btmp stores the diagonal coefficients used to update the remaining part of the result.
          */
         int startBlock = i;
         int endBlock = startBlock + (IsLowerTriangular ? 4 : -4);
         Matrix<Scalar,4,1> btmp;
         for (;IsLowerTriangular ? i<endBlock : i>endBlock;
              i += IsLowerTriangular ? 1 : -1)
         {
           if(!(Lhs::Flags & UnitDiagBit))
             other.coeffRef(i,c) /= lhs.coeff(i,i);
           int remainingSize = IsLowerTriangular ? endBlock-i-1 : i-endBlock-1;
           if (remainingSize>0)
             other.col(c).segment((IsLowerTriangular ? i : endBlock) + 1, remainingSize) -=
                 other.coeffRef(i,c)
               * Block<Lhs,Dynamic,1>(lhs, (IsLowerTriangular ? i : endBlock) + 1, i, remainingSize, 1);
           btmp.coeffRef(IsLowerTriangular ? i-startBlock : remainingSize) = -other.coeffRef(i,c);
         }

         /* Now we can efficiently update the remaining part of the result as a matrix * vector product.
          * NOTE in order to reduce both compilation time and binary size, let's directly call
          * the fast product implementation. It is equivalent to the following code:
          *   other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
          *                                       * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
          */
         // FIXME this is cool but what about conjugate/adjoint expressions ? do we want to evaluate them ?
         // this is a more general problem though.
         ei_cache_friendly_product_colmajor_times_vector(
           IsLowerTriangular ? size-endBlock : endBlock+1,
           &(lhs.const_cast_derived().coeffRef(IsLowerTriangular ? endBlock : 0, IsLowerTriangular ? startBlock : endBlock+1)),
           lhs.stride(),
           btmp, &(other.coeffRef(IsLowerTriangular ? endBlock : 0, c)));
 // 				if (IsLowerTriangular)
 //           other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
 //                                           * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
 // 				else
 //           other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
 //                                           * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
       }

       /* Now we have to process the remaining part as usual */
       int i;
       for(i=blockyEnd; IsLowerTriangular ? i<size-1 : i>0; i += (IsLowerTriangular ? 1 : -1) )
       {
         if(!(Lhs::Flags & UnitDiagBit))
           other.coeffRef(i,c) /= lhs.coeff(i,i);

         /* NOTE we cannot use lhs.col(i).end(size-i-1) because Part::coeffRef gets called by .col() to
          * get the address of the start of the row
          */
         if(IsLowerTriangular)
           other.col(c).end(size-i-1) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, i+1,i, size-i-1,1);
         else
           other.col(c).start(i) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, 0,i, i, 1);
       }
       if(!(Lhs::Flags & UnitDiagBit))
         other.coeffRef(i,c) /= lhs.coeff(i,i);
     }
   }
 };

 /** "in-place" version of MatrixBase::solveTriangular() where the result is written in \a other
   *
   * See MatrixBase:solveTriangular() for the details.
   */
 template<typename Derived>
 template<typename OtherDerived>
 void MatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
 {
   ei_assert(derived().cols() == derived().rows());
   ei_assert(derived().cols() == other.rows());
   ei_assert(!(Flags & ZeroDiagBit));
   ei_assert(Flags & (UpperTriangularBit|LowerTriangularBit));

   enum { copy = ei_traits<OtherDerived>::Flags & RowMajorBit };

   typedef typename ei_meta_if<copy,
     typename ei_plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::ret OtherCopy;
   OtherCopy otherCopy(other.derived());

   ei_solve_triangular_selector<Derived, typename ei_unref<OtherCopy>::type>::run(derived(), otherCopy);

   if (copy)
     other = otherCopy;
 }

 /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
   *
   * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other.
   * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
   * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
   * is an upper (resp. lower) triangular matrix.
   *
   * It is required that \c *this be marked as either an upper or a lower triangular matrix, which
   * can be done by marked(), and that is automatically the case with expressions such as those returned
   * by extract().
   *
   * \addexample SolveTriangular \label How to solve a triangular system (aka. how to multiply the inverse of a triangular matrix by another one)
   *
   * Example: \include MatrixBase_marked.cpp
   * Output: \verbinclude MatrixBase_marked.out
   *
   * This function is essentially a wrapper to the faster solveTriangularInPlace() function creating
   * a temporary copy of \a other, calling solveTriangularInPlace() on the copy and returning it.
   * Therefore, if \a other is not needed anymore, it is quite faster to call solveTriangularInPlace()
   * instead of solveTriangular().
   *
   * For users coming from BLAS, this function (and more specifically solveTriangularInPlace()) offer
   * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
   *
   * \b Tips: to perform a \em "right-inverse-multiply" you can simply transpose the operation, e.g.:
   * \code
   * M * T^1  <=>  T.transpose().solveTriangularInPlace(M.transpose());
   * \endcode
   *
   * \sa solveTriangularInPlace(), marked(), extract()
   */
 template<typename Derived>
 template<typename OtherDerived>
 typename ei_plain_matrix_type_column_major<OtherDerived>::type
 MatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
 {
   typename ei_plain_matrix_type_column_major<OtherDerived>::type res(other);
   solveTriangularInPlace(res);
   return res;
 }

 #endif // EIGEN_SOLVETRIANGULAR_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra. Eigen itself is part of the KDE project.
	//
	// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
	//
	// Eigen is free software; you can redistribute it and/or
	// modify it under the terms of the GNU Lesser General Public
	// License as published by the Free Software Foundation; either
	// version 3 of the License, or (at your option) any later version.
	//
	// Alternatively, you can redistribute it and/or
	// modify it under the terms of the GNU General Public License as
	// published by the Free Software Foundation; either version 2 of
	// the License, or (at your option) any later version.
	//
	// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
	// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
	// GNU General Public License for more details.
	//
	// You should have received a copy of the GNU Lesser General Public
	// License and a copy of the GNU General Public License along with
	// Eigen. If not, see <http://www.gnu.org/licenses/>.

	#ifndef EIGEN_SOLVETRIANGULAR_H
	#define EIGEN_SOLVETRIANGULAR_H

	template<typename XprType> struct ei_is_part { enum {value=false}; };
	template<typename XprType, unsigned int Mode> struct ei_is_part<Part<XprType,Mode> > { enum {value=true}; };

	template<typename Lhs, typename Rhs,
	int TriangularPart = (int(Lhs::Flags) & LowerTriangularBit)
	? LowerTriangular
	: (int(Lhs::Flags) & UpperTriangularBit)
	? UpperTriangular
	: -1,
	int StorageOrder = ei_is_part<Lhs>::value ? -1 // this is to solve ambiguous specializations
	: int(Lhs::Flags) & (RowMajorBit\|SparseBit)
	>
	struct ei_solve_triangular_selector;

	// transform a Part xpr to a Flagged xpr
	template<typename Lhs, unsigned int LhsMode, typename Rhs, int UpLo, int StorageOrder>
	struct ei_solve_triangular_selector<Part<Lhs,LhsMode>,Rhs,UpLo,StorageOrder>
	{
	static void run(const Part<Lhs,LhsMode>& lhs, Rhs& other)
	{
	ei_solve_triangular_selector<Flagged<Lhs,LhsMode,0>,Rhs>::run(lhs._expression(), other);
	}
	};

	// forward substitution, row-major
	template<typename Lhs, typename Rhs, int UpLo>
	struct ei_solve_triangular_selector<Lhs,Rhs,UpLo,RowMajor\|IsDense>
	{
	typedef typename Rhs::Scalar Scalar;
	static void run(const Lhs& lhs, Rhs& other)
	{
	const bool IsLowerTriangular = (UpLo==LowerTriangular);
	const int size = lhs.cols();
	/* We perform the inverse product per block of 4 rows such that we perfectly match
	* our optimized matrix * vector product. blockyStart represents the number of rows
	* we have process first using the non-block version.
	*/
	int blockyStart = (std::max(size-5,0)/4)*4;
	if (IsLowerTriangular)
	blockyStart = size - blockyStart;
	else
	blockyStart -= 1;
	for(int c=0 ; c<other.cols() ; ++c)
	{
	// process first rows using the non block version
	if(!(Lhs::Flags & UnitDiagBit))
	{
	if (IsLowerTriangular)
	other.coeffRef(0,c) = other.coeff(0,c)/lhs.coeff(0, 0);
	else
	other.coeffRef(size-1,c) = other.coeff(size-1, c)/lhs.coeff(size-1, size-1);
	}
	for(int i=(IsLowerTriangular ? 1 : size-2); IsLowerTriangular ? i<blockyStart : i>blockyStart; i += (IsLowerTriangular ? 1 : -1) )
	{
	Scalar tmp = other.coeff(i,c)
	- (IsLowerTriangular ? ((lhs.row(i).start(i)) * other.col(c).start(i)).coeff(0,0)
	: ((lhs.row(i).end(size-i-1)) * other.col(c).end(size-i-1)).coeff(0,0));
	if (Lhs::Flags & UnitDiagBit)
	other.coeffRef(i,c) = tmp;
	else
	other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
	}

	// now let's process the remaining rows 4 at once
	for(int i=blockyStart; IsLowerTriangular ? i<size : i>0; )
	{
	int startBlock = i;
	int endBlock = startBlock + (IsLowerTriangular ? 4 : -4);

	/* Process the i cols times 4 rows block, and keep the result in a temporary vector */
	// FIXME use fixed size block but take care to small fixed size matrices...
	Matrix<Scalar,Dynamic,1> btmp(4);
	if (IsLowerTriangular)
	btmp = lhs.block(startBlock,0,4,i) * other.col(c).start(i);
	else
	btmp = lhs.block(i-3,i+1,4,size-1-i) * other.col(c).end(size-1-i);

	/* Let's process the 4x4 sub-matrix as usual.
	* btmp stores the diagonal coefficients used to update the remaining part of the result.
	*/
	{
	Scalar tmp = other.coeff(startBlock,c)-btmp.coeff(IsLowerTriangular?0:3);
	if (Lhs::Flags & UnitDiagBit)
	other.coeffRef(i,c) = tmp;
	else
	other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
	}

	i += IsLowerTriangular ? 1 : -1;
	for (;IsLowerTriangular ? i<endBlock : i>endBlock; i += IsLowerTriangular ? 1 : -1)
	{
	int remainingSize = IsLowerTriangular ? i-startBlock : startBlock-i;
	Scalar tmp = other.coeff(i,c)
	- btmp.coeff(IsLowerTriangular ? remainingSize : 3-remainingSize)
	- ( lhs.row(i).segment(IsLowerTriangular ? startBlock : i+1, remainingSize)
	* other.col(c).segment(IsLowerTriangular ? startBlock : i+1, remainingSize)).coeff(0,0);

	if (Lhs::Flags & UnitDiagBit)
	other.coeffRef(i,c) = tmp;
	else
	other.coeffRef(i,c) = tmp/lhs.coeff(i,i);
	}
	}
	}
	}
	};

	// Implements the following configurations:
	// - inv(LowerTriangular, ColMajor) * Column vector
	// - inv(LowerTriangular,UnitDiag,ColMajor) * Column vector
	// - inv(UpperTriangular, ColMajor) * Column vector
	// - inv(UpperTriangular,UnitDiag,ColMajor) * Column vector
	template<typename Lhs, typename Rhs, int UpLo>
	struct ei_solve_triangular_selector<Lhs,Rhs,UpLo,ColMajor\|IsDense>
	{
	typedef typename Rhs::Scalar Scalar;
	typedef typename ei_packet_traits<Scalar>::type Packet;
	enum { PacketSize = ei_packet_traits<Scalar>::size };

	static void run(const Lhs& lhs, Rhs& other)
	{
	static const bool IsLowerTriangular = (UpLo==LowerTriangular);
	const int size = lhs.cols();
	for(int c=0 ; c<other.cols() ; ++c)
	{
	/* let's perform the inverse product per block of 4 columns such that we perfectly match
	* our optimized matrix * vector product. blockyEnd represents the number of rows
	* we can process using the block version.
	*/
	int blockyEnd = (std::max(size-5,0)/4)*4;
	if (!IsLowerTriangular)
	blockyEnd = size-1 - blockyEnd;
	for(int i=IsLowerTriangular ? 0 : size-1; IsLowerTriangular ? i<blockyEnd : i>blockyEnd;)
	{
	/* Let's process the 4x4 sub-matrix as usual.
	* btmp stores the diagonal coefficients used to update the remaining part of the result.
	*/
	int startBlock = i;
	int endBlock = startBlock + (IsLowerTriangular ? 4 : -4);
	Matrix<Scalar,4,1> btmp;
	for (;IsLowerTriangular ? i<endBlock : i>endBlock;
	i += IsLowerTriangular ? 1 : -1)
	{
	if(!(Lhs::Flags & UnitDiagBit))
	other.coeffRef(i,c) /= lhs.coeff(i,i);
	int remainingSize = IsLowerTriangular ? endBlock-i-1 : i-endBlock-1;
	if (remainingSize>0)
	other.col(c).segment((IsLowerTriangular ? i : endBlock) + 1, remainingSize) -=
	other.coeffRef(i,c)
	* Block<Lhs,Dynamic,1>(lhs, (IsLowerTriangular ? i : endBlock) + 1, i, remainingSize, 1);
	btmp.coeffRef(IsLowerTriangular ? i-startBlock : remainingSize) = -other.coeffRef(i,c);
	}

	/* Now we can efficiently update the remaining part of the result as a matrix * vector product.
	* NOTE in order to reduce both compilation time and binary size, let's directly call
	* the fast product implementation. It is equivalent to the following code:
	* other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
	* * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
	*/
	// FIXME this is cool but what about conjugate/adjoint expressions ? do we want to evaluate them ?
	// this is a more general problem though.
	ei_cache_friendly_product_colmajor_times_vector(
	IsLowerTriangular ? size-endBlock : endBlock+1,
	&(lhs.const_cast_derived().coeffRef(IsLowerTriangular ? endBlock : 0, IsLowerTriangular ? startBlock : endBlock+1)),
	lhs.stride(),
	btmp, &(other.coeffRef(IsLowerTriangular ? endBlock : 0, c)));
	// if (IsLowerTriangular)
	// other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
	// * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
	// else
	// other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock)
	// * other.col(c).block(startBlock,endBlock-startBlock)).lazy();
	}

	/* Now we have to process the remaining part as usual */
	int i;
	for(i=blockyEnd; IsLowerTriangular ? i<size-1 : i>0; i += (IsLowerTriangular ? 1 : -1) )
	{
	if(!(Lhs::Flags & UnitDiagBit))
	other.coeffRef(i,c) /= lhs.coeff(i,i);

	/* NOTE we cannot use lhs.col(i).end(size-i-1) because Part::coeffRef gets called by .col() to
	* get the address of the start of the row
	*/
	if(IsLowerTriangular)
	other.col(c).end(size-i-1) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, i+1,i, size-i-1,1);
	else
	other.col(c).start(i) -= other.coeffRef(i,c) * Block<Lhs,Dynamic,1>(lhs, 0,i, i, 1);
	}
	if(!(Lhs::Flags & UnitDiagBit))
	other.coeffRef(i,c) /= lhs.coeff(i,i);
	}
	}
	};

	/** "in-place" version of MatrixBase::solveTriangular() where the result is written in \a other
	*
	* See MatrixBase:solveTriangular() for the details.
	*/
	template<typename Derived>
	template<typename OtherDerived>
	void MatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
	{
	ei_assert(derived().cols() == derived().rows());
	ei_assert(derived().cols() == other.rows());
	ei_assert(!(Flags & ZeroDiagBit));
	ei_assert(Flags & (UpperTriangularBit\|LowerTriangularBit));

	enum { copy = ei_traits<OtherDerived>::Flags & RowMajorBit };

	typedef typename ei_meta_if<copy,
	typename ei_plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::ret OtherCopy;
	OtherCopy otherCopy(other.derived());

	ei_solve_triangular_selector<Derived, typename ei_unref<OtherCopy>::type>::run(derived(), otherCopy);

	if (copy)
	other = otherCopy;
	}

	/** \returns the product of the inverse of \c this with \a other, \a this being triangular.
	*
	* This function computes the inverse-matrix matrix product inverse(\c this) \a other.
	* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
	* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
	* is an upper (resp. lower) triangular matrix.
	*
	* It is required that \c *this be marked as either an upper or a lower triangular matrix, which
	* can be done by marked(), and that is automatically the case with expressions such as those returned
	* by extract().
	*
	* \addexample SolveTriangular \label How to solve a triangular system (aka. how to multiply the inverse of a triangular matrix by another one)
	*
	* Example: \include MatrixBase_marked.cpp
	* Output: \verbinclude MatrixBase_marked.out
	*
	* This function is essentially a wrapper to the faster solveTriangularInPlace() function creating
	* a temporary copy of \a other, calling solveTriangularInPlace() on the copy and returning it.
	* Therefore, if \a other is not needed anymore, it is quite faster to call solveTriangularInPlace()
	* instead of solveTriangular().
	*
	* For users coming from BLAS, this function (and more specifically solveTriangularInPlace()) offer
	* all the operations supported by the \c TRSV and \c TRSM BLAS routines.
	*
	* \b Tips: to perform a \em "right-inverse-multiply" you can simply transpose the operation, e.g.:
	* \code
	* M * T^1 <=> T.transpose().solveTriangularInPlace(M.transpose());
	* \endcode
	*
	* \sa solveTriangularInPlace(), marked(), extract()
	*/
	template<typename Derived>
	template<typename OtherDerived>
	typename ei_plain_matrix_type_column_major<OtherDerived>::type
	MatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
	{
	typename ei_plain_matrix_type_column_major<OtherDerived>::type res(other);
	solveTriangularInPlace(res);
	return res;
	}

	#endif // EIGEN_SOLVETRIANGULAR_H