Optimization in LU::solve: when rows<=cols, no need to compute the L matrix
Remove matrixL() and matrixU() methods: they were tricky, returning a Part,
and matrixL() was useless for non-square LU. Also they were untested. This is
the occasion to simplify the docs (class_LU.cpp) removing the most confusing part.
I think that it's better to let the user do his own cooking with Part's.
diff --git a/Eigen/src/LU/LU.h b/Eigen/src/LU/LU.h
index b36dc30..00a6dcf 100644
--- a/Eigen/src/LU/LU.h
+++ b/Eigen/src/LU/LU.h
@@ -45,14 +45,9 @@
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
   * permutationP(), permutationQ(). Convenience methods matrixL(), matrixU() are also provided.
   *
-  * As an exemple, here is how the original matrix can be retrieved, in the square case:
-  * \include class_LU_1.cpp
-  * Output: \verbinclude class_LU_1.out
-  *
-  * When the matrix is not square, matrixL() is no longer very useful: if one needs it, one has
-  * to construct the L matrix by hand, as shown in this example:
-  * \include class_LU_2.cpp
-  * Output: \verbinclude class_LU_2.out
+  * As an exemple, here is how the original matrix can be retrieved:
+  * \include class_LU.cpp
+  * Output: \verbinclude class_LU.out
   *
   * \sa MatrixBase::lu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse()
   */
@@ -91,6 +86,14 @@
                    MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
     > ImageResultType;
 
+    typedef Matrix<typename MatrixType::Scalar,
+                   MatrixType::RowsAtCompileTime,
+                   MatrixType::RowsAtCompileTime,
+                   MatrixType::Options,
+                   MatrixType::MaxRowsAtCompileTime,
+                   MatrixType::MaxRowsAtCompileTime
+    > MatrixLType;
+
     /** Constructor.
       *
       * \param matrix the matrix of which to compute the LU decomposition.
@@ -108,26 +111,6 @@
       return m_lu;
     }
 
-    /** \returns an expression of the unit-lower-triangular part of the LU matrix. In the square case,
-      *          this is the L matrix. In the non-square, actually obtaining the L matrix takes some
-      *          more care, see the documentation of class LU.
-      *
-      * \sa matrixLU(), matrixU()
-      */
-    inline const Part<MatrixType, UnitLowerTriangular> matrixL() const
-    {
-      return m_lu;
-    }
-
-    /** \returns an expression of the U matrix, i.e. the upper-triangular part of the LU matrix.
-      *
-      * \sa matrixLU(), matrixL()
-      */
-    inline const Part<MatrixType, UpperTriangular> matrixU() const
-    {
-      return m_lu;
-    }
-
     /** \returns a vector of integers, whose size is the number of rows of the matrix being decomposed,
       * representing the P permutation i.e. the permutation of the rows. For its precise meaning,
       * see the examples given in the documentation of class LU.
@@ -495,7 +478,7 @@
    * Step 4: result = Qd;
    */
 
-  const int rows = m_lu.rows();
+  const int rows = m_lu.rows(), cols = m_lu.cols();
   ei_assert(b.rows() == rows);
   const int smalldim = std::min(rows, m_lu.cols());
 
@@ -505,14 +488,20 @@
   for(int i = 0; i < rows; ++i) c.row(m_p.coeff(i)) = b.row(i);
 
   // Step 2
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime,
-         MatrixType::Options,
-         MatrixType::MaxRowsAtCompileTime,
-         MatrixType::MaxRowsAtCompileTime> l(rows, rows);
-  l.setZero();
-  l.corner(Eigen::TopLeft,rows,smalldim)
-    = m_lu.corner(Eigen::TopLeft,rows,smalldim);
-  l.template marked<UnitLowerTriangular>().solveTriangularInPlace(c);
+  if(rows <= cols)
+    m_lu.corner(Eigen::TopLeft,rows,smalldim).template marked<UnitLowerTriangular>().solveTriangularInPlace(c);
+  else
+  {
+    // construct the L matrix. We shouldn't do that everytime, it is a very large overhead in the case of vector solving.
+    // However the case rows>cols is rather unusual with LU so this is probably not a huge priority.
+    Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime,
+           MatrixType::Options,
+           MatrixType::MaxRowsAtCompileTime,
+           MatrixType::MaxRowsAtCompileTime> l(rows, rows);
+    l.setZero();
+    l.corner(Eigen::TopLeft,rows,smalldim) = m_lu.corner(Eigen::TopLeft,rows,smalldim);
+    l.template marked<UnitLowerTriangular>().solveTriangularInPlace(c);
+  }
 
   // Step 3
   if(!isSurjective())
diff --git a/doc/snippets/class_LU_2.cpp b/doc/snippets/class_LU.cpp
similarity index 72%
rename from doc/snippets/class_LU_2.cpp
rename to doc/snippets/class_LU.cpp
index 0c72d34..9958368 100644
--- a/doc/snippets/class_LU_2.cpp
+++ b/doc/snippets/class_LU.cpp
@@ -5,14 +5,16 @@
 Eigen::LU<Matrix5x3> lu(m);
 cout << "Here is, up to permutations, its LU decomposition matrix:"
      << endl << lu.matrixLU() << endl;
-cout << "Here is the actual L matrix in this decomposition:" << endl;
+cout << "Here is the L part:" << endl;
 Matrix5x5 l = Matrix5x5::Identity();
 l.block<5,3>(0,0).part<StrictlyLowerTriangular>() = lu.matrixLU();
 cout << l << endl;
+cout << "Here is the U part:" << endl;
+Matrix5x3 u = lu.matrixLU().part<UpperTriangular>();
+cout << u << endl;
 cout << "Let us now reconstruct the original matrix m:" << endl;
-Matrix5x3 x = l * lu.matrixU();
+Matrix5x3 x = l * u;
 Matrix5x3 y;
 for(int i = 0; i < 5; i++) for(int j = 0; j < 3; j++)
   y(i, lu.permutationQ()[j]) = x(lu.permutationP()[i], j);
-cout << y << endl;
-assert(y.isApprox(m));
+cout << y << endl; // should be equal to the original matrix m
diff --git a/doc/snippets/class_LU_1.cpp b/doc/snippets/class_LU_1.cpp
deleted file mode 100644
index 50cfc4b..0000000
--- a/doc/snippets/class_LU_1.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-Matrix3d m = Matrix3d::Random();
-cout << "Here is the matrix m:" << endl << m << endl;
-Eigen::LU<Matrix3d> lu(m);
-cout << "Here is, up to permutations, its LU decomposition matrix:"
-     << endl << lu.matrixLU() << endl;
-cout << "Let us now reconstruct the original matrix m from it:" << endl;
-Matrix3d x = lu.matrixL() * lu.matrixU();
-Matrix3d y;
-for(int i = 0; i < 3; i++) for(int j = 0; j < 3; j++)
-  y(i, lu.permutationQ()[j]) = x(lu.permutationP()[i], j);
-cout << y << endl;
-assert(y.isApprox(m));