add blocked LLT, and bugfix in trsm asserts

diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h
index 4527067..58ac0c1 100644
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h

@@ -116,80 +116,81 @@
     bool m_isInitialized;
 };
 
-template<typename MatrixType>
-bool ei_inplace_llt_lo(MatrixType& mat)
+// forward declaration (defined at the end of this file)
+template<int UpLo> struct ei_llt_inplace;
+
+template<> struct ei_llt_inplace<LowerTriangular>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  assert(mat.rows()==mat.cols());
-  const int size = mat.rows();
-
-  // The biggest overall is the point of reference to which further diagonals
-  // are compared; if any diagonal is negligible compared
-  // to the largest overall, the algorithm bails.  This cutoff is suggested
-  // in "Analysis of the Cholesky Decomposition of a Semi-definite Matrix" by
-  // Nicholas J. Higham. Also see "Accuracy and Stability of Numerical
-  // Algorithms" page 217, also by Higham.
-  const RealScalar cutoff = machine_epsilon<Scalar>() * size * mat.diagonal().cwise().abs().maxCoeff();
-  RealScalar x;
-  x = ei_real(mat.coeff(0,0));
-  mat.coeffRef(0,0) = ei_sqrt(x);
-  if(size==1)
+  template<typename MatrixType>
+  static bool unblocked(MatrixType& mat)
   {
-    return true;
-  }
-  mat.col(0).end(size-1) = mat.col(0).end(size-1) / ei_real(mat.coeff(0,0));
-  for (int j = 1; j < size; ++j)
-  {
-    x = ei_real(mat.coeff(j,j)) - mat.row(j).start(j).squaredNorm();
-    if (ei_abs(x) < cutoff) continue;
-
-    mat.coeffRef(j,j) = x = ei_sqrt(x);
-
-    int endSize = size-j-1;
-    if (endSize>0)
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    ei_assert(mat.rows()==mat.cols());
+    const int size = mat.rows();
+    for(int k = 0; k < size; ++k)
     {
-      mat.col(j).end(endSize) -= (mat.block(j+1, 0, endSize, j) * mat.row(j).start(j).adjoint()).lazy();
-      mat.col(j).end(endSize) *= RealScalar(1)/x;
+      int rs = size-k-1; // remaining size
+
+      Block<MatrixType,Dynamic,1> A21(mat,k+1,k,rs,1);
+      Block<MatrixType,1,Dynamic> A10(mat,k,0,1,k);
+      Block<MatrixType,Dynamic,Dynamic> A20(mat,k+1,0,rs,k);
+
+      RealScalar x = ei_real(mat.coeff(k,k));
+      if (k>0) x -= mat.row(k).start(k).squaredNorm();
+      if (x<=RealScalar(0))
+        return false;
+      mat.coeffRef(k,k) = x = ei_sqrt(x);
+      if (k>0 && rs>0) A21 -= (A20 * A10.adjoint()).lazy();
+      if (rs>0) A21 *= RealScalar(1)/x;
     }
-  }
-
-  return true;
-}
-
-template<typename MatrixType>
-bool ei_inplace_llt_up(MatrixType& mat)
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  assert(mat.rows()==mat.cols());
-  const int size = mat.rows();
-
-  const RealScalar cutoff = machine_epsilon<Scalar>() * size * mat.diagonal().cwise().abs().maxCoeff();
-  RealScalar x;
-  x = ei_real(mat.coeff(0,0));
-  mat.coeffRef(0,0) = ei_sqrt(x);
-  if(size==1)
-  {
     return true;
   }
-  mat.row(0).end(size-1) = mat.row(0).end(size-1) / ei_real(mat.coeff(0,0));
-  for (int j = 1; j < size; ++j)
+
+  template<typename MatrixType>
+  static bool blocked(MatrixType& m)
   {
-    x = ei_real(mat.coeff(j,j)) - mat.col(j).start(j).squaredNorm();
-    if (ei_abs(x) < cutoff) continue;
+    ei_assert(m.rows()==m.cols());
+    int size = m.rows();
+    if(size<32)
+      return unblocked(m);
 
-    mat.coeffRef(j,j) = x = ei_sqrt(x);
+    int blockSize = size/8;
+    blockSize = (blockSize/16)*16;
+    blockSize = std::min(std::max(blockSize,8), 128);
 
-    int endSize = size-j-1;
-    if (endSize>0) {
-      mat.row(j).end(endSize) -= (mat.col(j).start(j).adjoint() * mat.block(0, j+1, j, endSize)).lazy();
-      mat.row(j).end(endSize) *= RealScalar(1)/x;
+    for (int k=0; k<size; k+=blockSize)
+    {
+      int bs = std::min(blockSize, size-k);
+      int rs = size - k - bs;
+
+      Block<MatrixType,Dynamic,Dynamic> A11(m,k,   k,   bs,bs);
+      Block<MatrixType,Dynamic,Dynamic> A21(m,k+bs,k,   rs,bs);
+      Block<MatrixType,Dynamic,Dynamic> A22(m,k+bs,k+bs,rs,rs);
+
+      if(!unblocked(A11)) return false;
+      if(rs>0) A11.conjugate().template triangularView<LowerTriangular>().solveInPlace(A21.transpose());
+      if(rs>0) A22.template selfadjointView<LowerTriangular>().rankUpdate(A21,-1); // bottleneck
     }
+    return true;
   }
+};
 
-  return true;
-}
+template<> struct ei_llt_inplace<UpperTriangular>
+{
+  template<typename MatrixType>
+  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat)
+  {
+    Transpose<MatrixType> matt(mat);
+    return ei_llt_inplace<LowerTriangular>::unblocked(matt);
+  }
+  template<typename MatrixType>
+  static EIGEN_STRONG_INLINE bool blocked(MatrixType& mat)
+  {
+    Transpose<MatrixType> matt(mat);
+    return ei_llt_inplace<LowerTriangular>::blocked(matt);
+  }
+};
 
 template<typename MatrixType> struct LLT_Traits<MatrixType,LowerTriangular>
 {
@@ -198,7 +199,7 @@
   inline static MatrixL getL(const MatrixType& m) { return m; }
   inline static MatrixU getU(const MatrixType& m) { return m.adjoint().nestByValue(); }
   static bool inplace_decomposition(MatrixType& m)
-  { return ei_inplace_llt_lo(m); }
+  { return ei_llt_inplace<LowerTriangular>::blocked(m); }
 };
 
 template<typename MatrixType> struct LLT_Traits<MatrixType,UpperTriangular>
@@ -208,7 +209,7 @@
   inline static MatrixL getL(const MatrixType& m) { return m.adjoint().nestByValue(); }
   inline static MatrixU getU(const MatrixType& m) { return m; }
   static bool inplace_decomposition(MatrixType& m)
-  { return ei_inplace_llt_up(m); }
+  { return ei_llt_inplace<UpperTriangular>::blocked(m); }
 };
 
 /** Computes / recomputes the Cholesky decomposition A = LL^* = U^*U of \a matrix

diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 9b67dd5..15b45e4 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h

@@ -215,25 +215,25 @@
   * See TriangularView:solve() for the details.
   */
 template<typename MatrixType, unsigned int Mode>
-template<int Side, typename RhsDerived>
-void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<RhsDerived>& _rhs) const
+template<int Side, typename OtherDerived>
+void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
-  RhsDerived& rhs = _rhs.const_cast_derived();
+  OtherDerived& other = _other.const_cast_derived();
   ei_assert(cols() == rows());
-  ei_assert(cols() == rhs.rows());
+  ei_assert( (Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols()) );
   ei_assert(!(Mode & ZeroDiagBit));
   ei_assert(Mode & (UpperTriangularBit|LowerTriangularBit));
 
-  enum { copy = ei_traits<RhsDerived>::Flags & RowMajorBit  && RhsDerived::IsVectorAtCompileTime };
+  enum { copy = ei_traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
   typedef typename ei_meta_if<copy,
-    typename ei_plain_matrix_type_column_major<RhsDerived>::type, RhsDerived&>::ret RhsCopy;
-  RhsCopy rhsCopy(rhs);
+    typename ei_plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::ret OtherCopy;
+  OtherCopy otherCopy(other);
 
-  ei_triangular_solver_selector<MatrixType, typename ei_unref<RhsCopy>::type,
-    Side, Mode>::run(_expression(), rhsCopy);
+  ei_triangular_solver_selector<MatrixType, typename ei_unref<OtherCopy>::type,
+    Side, Mode>::run(_expression(), otherCopy);
 
   if (copy)
-    rhs = rhsCopy;
+    other = otherCopy;
 }
 
 /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.

diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h
index 77865c4..0787daa 100644
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h

@@ -70,7 +70,9 @@
     inline int rows() const { return m_matrix.cols(); }
     inline int cols() const { return m_matrix.rows(); }
     inline int nonZeros() const { return m_matrix.nonZeros(); }
-    inline int stride(void) const { return m_matrix.stride(); }
+    inline int stride() const { return m_matrix.stride(); }
+    inline Scalar* data() { return m_matrix.data(); }
+    inline const Scalar* data() const { return m_matrix.data(); }
 
     inline Scalar& coeffRef(int row, int col)
     {

diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 4b3952a..e3b72f2 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp

@@ -49,52 +49,58 @@
   MatrixType matB = MatrixType::Random(rows,cols), matX(rows,cols);
   SquareMatrixType symm =  a0 * a0.adjoint();
   // let's make sure the matrix is not singular or near singular
-  MatrixType a1 = MatrixType::Random(rows,cols);
-  symm += a1 * a1.adjoint();
+  for (int k=0; k<3; ++k)
+  {
+    MatrixType a1 = MatrixType::Random(rows,cols);
+    symm += a1 * a1.adjoint();
+  }
+
+  SquareMatrixType symmUp = symm.template triangularView<UpperTriangular>();
+  SquareMatrixType symmLo = symm.template triangularView<LowerTriangular>();
 
   // to test if really Cholesky only uses the upper triangular part, uncomment the following
   // FIXME: currently that fails !!
   //symm.template part<StrictlyLowerTriangular>().setZero();
 
   #ifdef HAS_GSL
-  if (ei_is_same_type<RealScalar,double>::ret)
-  {
-    typedef GslTraits<Scalar> Gsl;
-    typename Gsl::Matrix gMatA=0, gSymm=0;
-    typename Gsl::Vector gVecB=0, gVecX=0;
-    convert<MatrixType>(symm, gSymm);
-    convert<MatrixType>(symm, gMatA);
-    convert<VectorType>(vecB, gVecB);
-    convert<VectorType>(vecB, gVecX);
-    Gsl::cholesky(gMatA);
-    Gsl::cholesky_solve(gMatA, gVecB, gVecX);
-    VectorType vecX(rows), _vecX, _vecB;
-    convert(gVecX, _vecX);
-    symm.llt().solve(vecB, &vecX);
-    Gsl::prod(gSymm, gVecX, gVecB);
-    convert(gVecB, _vecB);
-    // test gsl itself !
-    VERIFY_IS_APPROX(vecB, _vecB);
-    VERIFY_IS_APPROX(vecX, _vecX);
-
-    Gsl::free(gMatA);
-    Gsl::free(gSymm);
-    Gsl::free(gVecB);
-    Gsl::free(gVecX);
-  }
+//   if (ei_is_same_type<RealScalar,double>::ret)
+//   {
+//     typedef GslTraits<Scalar> Gsl;
+//     typename Gsl::Matrix gMatA=0, gSymm=0;
+//     typename Gsl::Vector gVecB=0, gVecX=0;
+//     convert<MatrixType>(symm, gSymm);
+//     convert<MatrixType>(symm, gMatA);
+//     convert<VectorType>(vecB, gVecB);
+//     convert<VectorType>(vecB, gVecX);
+//     Gsl::cholesky(gMatA);
+//     Gsl::cholesky_solve(gMatA, gVecB, gVecX);
+//     VectorType vecX(rows), _vecX, _vecB;
+//     convert(gVecX, _vecX);
+//     symm.llt().solve(vecB, &vecX);
+//     Gsl::prod(gSymm, gVecX, gVecB);
+//     convert(gVecB, _vecB);
+//     // test gsl itself !
+//     VERIFY_IS_APPROX(vecB, _vecB);
+//     VERIFY_IS_APPROX(vecX, _vecX);
+// 
+//     Gsl::free(gMatA);
+//     Gsl::free(gSymm);
+//     Gsl::free(gVecB);
+//     Gsl::free(gVecX);
+//   }
   #endif
 
   {
-    LLT<SquareMatrixType> chol(symm);
-    VERIFY_IS_APPROX(symm, chol.matrixL().toDense() * chol.matrixL().adjoint().toDense());
-    chol.solve(vecB, &vecX);
+    LLT<SquareMatrixType,LowerTriangular> chollo(symmLo);
+    VERIFY_IS_APPROX(symm, chollo.matrixL().toDense() * chollo.matrixL().adjoint().toDense());
+    chollo.solve(vecB, &vecX);
     VERIFY_IS_APPROX(symm * vecX, vecB);
-    chol.solve(matB, &matX);
+    chollo.solve(matB, &matX);
     VERIFY_IS_APPROX(symm * matX, matB);
 
     // test the upper mode
-    LLT<SquareMatrixType,UpperTriangular> cholup(symm);
-    VERIFY_IS_APPROX(symm, cholup.matrixL().toDense() * chol.matrixL().adjoint().toDense());
+    LLT<SquareMatrixType,UpperTriangular> cholup(symmUp);
+    VERIFY_IS_APPROX(symm, cholup.matrixL().toDense() * cholup.matrixL().adjoint().toDense());
     cholup.solve(vecB, &vecX);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     cholup.solve(matB, &matX);
@@ -147,9 +153,8 @@
     CALL_SUBTEST( cholesky(Matrix2d()) );
     CALL_SUBTEST( cholesky(Matrix3f()) );
     CALL_SUBTEST( cholesky(Matrix4d()) );
-    CALL_SUBTEST( cholesky(MatrixXcd(7,7)) );
-    CALL_SUBTEST( cholesky(MatrixXd(17,17)) );
-    CALL_SUBTEST( cholesky(MatrixXf(200,200)) );
+    CALL_SUBTEST( cholesky(MatrixXcd(100,100)) );
+    CALL_SUBTEST( cholesky(MatrixXd(200,200)) );
   }
 
   CALL_SUBTEST( cholesky_verify_assert<Matrix3f>() );