Add a *very efficient* evaluation path for both col-major matrix * vector
and vector * row-major products. Currently, it is enabled only is the matrix
has DirectAccessBit flag and the product is "large enough".
Added the respective unit tests in test/product/cpp.
diff --git a/Eigen/Core b/Eigen/Core
index 333e80a..6c6de3a 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -59,10 +59,7 @@
 #include "src/Core/CommaInitializer.h"
 #include "src/Core/Extract.h"
 #include "src/Core/Part.h"
-
-#ifndef EIGEN_EXTERN_INSTANTIATIONS
 #include "src/Core/CacheFriendlyProduct.h"
-#endif
 
 } // namespace Eigen
 
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index 828b497..ba53e29 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -249,7 +249,6 @@
 {
   static void run(Derived1 &dst, const Derived2 &src)
   {
-    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
     const int innerSize = dst.innerSize();
     const int outerSize = dst.outerSize();
     const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 2453575..ae4e83c 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -155,7 +155,6 @@
       return m_matrix.const_cast_derived()
              .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-
     }
 
     inline const Scalar coeff(int index) const
diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h
index 0da84ee..a710d44 100644
--- a/Eigen/src/Core/CacheFriendlyProduct.h
+++ b/Eigen/src/Core/CacheFriendlyProduct.h
@@ -25,6 +25,8 @@
 #ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
 #define EIGEN_CACHE_FRIENDLY_PRODUCT_H
 
+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
 template<typename Scalar>
 static void ei_cache_friendly_product(
   int _rows, int _cols, int depth,
@@ -77,8 +79,6 @@
     MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
   };
 
-
-  //const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0));
   const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
 
   const int remainingSize = depth % PacketSize;
@@ -357,4 +357,165 @@
     free(block);
 }
 
+#endif // EIGEN_EXTERN_INSTANTIATIONS
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at onces that allows to both reduce
+ * the number of load/stores of the result by a factor 4 and to reduce
+ * the instruction dependency. Moreover, we know that all bands have the
+ * same alignment pattern.
+ * TODO: since rhs gets evaluated only once, no need to evaluate it
+ */
+template<typename Scalar, typename RhsType>
+EIGEN_DONT_INLINE static void ei_cache_friendly_product(
+  int size,
+  const Scalar* lhs, int lhsStride,
+  const RhsType& rhs,
+  Scalar* res)
+{
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2,OFFSET) \
+    ei_pstore(&res[j OFFSET], \
+      ei_padd(ei_pload(&res[j OFFSET]), \
+        ei_padd( \
+          ei_padd(ei_pmul(ptmp0,ei_pload ## A0(&lhs[j OFFSET +iN0])),ei_pmul(ptmp1,ei_pload ## A13(&lhs[j OFFSET +iN1]))), \
+          ei_padd(ei_pmul(ptmp2,ei_pload ## A2(&lhs[j OFFSET +iN2])),ei_pmul(ptmp3,ei_pload ## A13(&lhs[j OFFSET +iN3]))) )))
+
+  asm("#begin matrix_vector_product");
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum { AllAligned, EvenAligned, FirstAligned, NoneAligned };
+  const int columnsAtOnce = 4;
+  const int peels = 2;
+  const int PacketAlignedMask = PacketSize-1;
+  const int PeelAlignedMask = PacketSize*peels-1;
+  const bool Vectorized = sizeof(Packet) != sizeof(Scalar);
+
+  // How many coeffs of the result do we have to skip to be aligned.
+  // Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
+  const int alignedStart = Vectorized
+                         ? std::min<int>( (PacketSize - ((size_t(res)/sizeof(Scalar)) & PacketAlignedMask)) & PacketAlignedMask, size)
+                         : 0;
+  const int alignedSize = alignedStart + ((size-alignedStart) & ~PacketAlignedMask);
+  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : 0;
+
+  const int alignmentStep = lhsStride % PacketSize;
+  int alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==2 ? EvenAligned
+                       : FirstAligned;
+
+  // find how many column do we have to skip to be aligned with the result (if possible)
+  int skipColumns=0;
+  for (; skipColumns<PacketSize; ++skipColumns)
+  {
+    if (alignedStart == alignmentStep*skipColumns)
+      break;
+  }
+  if (skipColumns==PacketSize)
+    alignmentPattern = NoneAligned;
+  skipColumns = std::min(skipColumns,rhs.size());
+  if (alignmentPattern!=NoneAligned)
+    for (int i=0; i<skipColumns; i++)
+    {
+      Scalar tmp0 = rhs[i];
+      Packet ptmp0 = ei_pset1(tmp0);
+      int iN0 = i*lhsStride;
+      // process first unaligned result's coeffs
+      for (int j=0; j<alignedStart; j++)
+        res[j] += tmp0 * lhs[j+iN0];
+      // process aligned result's coeffs (we know the lhs columns are not aligned)
+      for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+        ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+      // process remaining result's coeffs
+      for (int j=alignedSize; j<size; j++)
+        res[j] += tmp0 * lhs[j+iN0];
+    }
+
+  int columnBound = (rhs.size()/columnsAtOnce)*columnsAtOnce;
+  for (int i=0; i<columnBound; i+=columnsAtOnce)
+  {
+    Scalar tmp0 = rhs[i];
+    Packet ptmp0 = ei_pset1(tmp0);
+    Scalar tmp1 = rhs[i+1];
+    Packet ptmp1 = ei_pset1(tmp1);
+    Scalar tmp2 = rhs[i+2];
+    Packet ptmp2 = ei_pset1(tmp2);
+    Scalar tmp3 = rhs[i+3];
+    Packet ptmp3 = ei_pset1(tmp3);
+    int iN0 = i*lhsStride;
+    int iN1 = (i+1)*lhsStride;
+    int iN2 = (i+2)*lhsStride;
+    int iN3 = (i+3)*lhsStride;
+
+    // process initial unaligned coeffs
+    for (int j=0; j<alignedStart; j++)
+      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+
+    if (alignedSize>0)
+    {
+      switch(alignmentPattern)
+      {
+        case AllAligned:
+          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,,,);
+          break;
+        case EvenAligned:
+          for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,u,,);
+          break;
+        case FirstAligned:
+          if (peels>1)
+            for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+            {
+              _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+              _EIGEN_ACCUMULATE_PACKETS(,u,u,+PacketSize);
+              if (peels>2) _EIGEN_ACCUMULATE_PACKETS(,u,u,+2*PacketSize);
+              if (peels>3) _EIGEN_ACCUMULATE_PACKETS(,u,u,+3*PacketSize);
+              if (peels>4) _EIGEN_ACCUMULATE_PACKETS(,u,u,+4*PacketSize);
+              if (peels>5) _EIGEN_ACCUMULATE_PACKETS(,u,u,+5*PacketSize);
+              if (peels>6) _EIGEN_ACCUMULATE_PACKETS(,u,u,+6*PacketSize);
+              if (peels>7) _EIGEN_ACCUMULATE_PACKETS(,u,u,+7*PacketSize);
+            }
+          for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(,u,u,);
+          break;
+        default:
+          for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+            _EIGEN_ACCUMULATE_PACKETS(u,u,u,);
+          break;
+      }
+    }
+
+    // process remaining coeffs
+    for (int j=alignedSize; j<size; j++)
+      res[j] += tmp0 * lhs[j+iN0] + tmp1 * lhs[j+iN1] + tmp2 * lhs[j+iN2] + tmp3 * lhs[j+iN3];
+  }
+  for (int i=columnBound; i<rhs.size(); i++)
+  {
+    Scalar tmp0 = rhs[i];
+    Packet ptmp0 = ei_pset1(tmp0);
+    int iN0 = i*lhsStride;
+    if (alignedSize>0)
+    {
+      bool aligned0 = (iN0 % PacketSize) == 0;
+      if (aligned0)
+        for (int j = 0;j<alignedSize;j+=PacketSize)
+          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_pload(&lhs[j+iN0])),ei_pload(&res[j])));
+      else
+        for (int j = 0;j<alignedSize;j+=PacketSize)
+          ei_pstore(&res[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&lhs[j+iN0])),ei_pload(&res[j])));
+    }
+    // process remaining scalars
+    for (int j=alignedSize; j<size; j++)
+      res[j] += tmp0 * lhs[j+iN0];
+  }
+  asm("#end matrix_vector_product");
+
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
+
 #endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index 6089c1b..c2f0c07 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -250,8 +250,8 @@
       return res;
     }
 
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
+    inline const _LhsNested& lhs() const { return m_lhs; }
+    inline const _RhsNested& rhs() const { return m_rhs; }
 
   protected:
     const LhsNested m_lhs;
@@ -480,11 +480,22 @@
 * Cache friendly product callers and specific nested evaluation strategies
 ***************************************************************************/
 
+template<typename Scalar, typename RhsType>
+static void ei_cache_friendly_product(
+  int size, const Scalar* lhs, int lhsStride, const RhsType& rhs, Scalar* res);
+
+enum {
+  HasDirectAccess,
+  NoDirectAccess
+};
+
 template<typename ProductType,
   int LhsRows  = ei_traits<ProductType>::RowsAtCompileTime,
   int LhsOrder = int(ei_traits<ProductType>::LhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+  int LhsHasDirectAccess = int(ei_traits<ProductType>::LhsFlags)&DirectAccessBit? HasDirectAccess : NoDirectAccess,
   int RhsCols  = ei_traits<ProductType>::ColsAtCompileTime,
-  int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor>
+  int RhsOrder = int(ei_traits<ProductType>::RhsFlags)&RowMajorBit ? RowMajor : ColMajor,
+  int RhsHasDirectAccess = int(ei_traits<ProductType>::RhsFlags)&DirectAccessBit? HasDirectAccess : NoDirectAccess>
 struct ei_cache_friendly_product_selector
 {
   template<typename DestDerived>
@@ -495,21 +506,57 @@
 };
 
 // optimized colmajor * vector path
-template<typename ProductType, int LhsRows, int RhsOrder>
-struct ei_cache_friendly_product_selector<ProductType,LhsRows,ColMajor,1,RhsOrder>
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,NoDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
 {
+  typedef typename ei_traits<ProductType>::_LhsNested Lhs;
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
   {
-    const int rows = product.rhs().rows();
-    for (int j=0; j<rows; ++j)
-      res += product.rhs().coeff(j) * product.lhs().col(j);
+    ei_scalar_sum_op<typename ProductType::Scalar> _sum;
+    const int size = product.rhs().rows();
+    for (int k=0; k<size; ++k)
+      if (Lhs::Flags&DirectAccessBit)
+        // TODO to properly hanlde this workaround let's specialize Block for type having the DirectAccessBit
+        res += product.rhs().coeff(k) * Map<DestDerived>(&product.lhs().const_cast_derived().coeffRef(0,k),product.lhs().rows());
+      else
+        res += product.rhs().coeff(k) * product.lhs().col(k);
+  }
+};
+
+// optimized cache friendly colmajor * vector path for matrix with direct access flag
+// NOTE this path coul also be enabled for expressions if we add runtime align queries
+template<typename ProductType, int LhsRows, int RhsOrder, int RhsAccess>
+struct ei_cache_friendly_product_selector<ProductType,LhsRows,HasDirectAccess,ColMajor,1,RhsOrder,RhsAccess>
+{
+  typedef typename ProductType::Scalar Scalar;
+
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    enum {
+      EvalToRes = (ei_packet_traits<Scalar>::size==1)
+                ||(DestDerived::Flags&ActualPacketAccessBit) && (!(DestDerived::Flags & RowMajorBit)) };
+    Scalar* __restrict__ _res;
+    if (EvalToRes)
+       _res = &res.coeffRef(0);
+    else
+    {
+      _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+      Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+    }
+    ei_cache_friendly_product(res.size(),
+      &product.lhs().const_cast_derived().coeffRef(0,0), product.lhs().stride(),
+      product.rhs(), _res);
+
+    if (!EvalToRes)
+      res = Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size());
   }
 };
 
 // optimized vector * rowmajor path
-template<typename ProductType, int LhsOrder, int RhsCols>
-struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,RhsCols,RowMajor>
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,NoDirectAccess>
 {
   template<typename DestDerived>
   inline static void run(DestDerived& res, const ProductType& product)
@@ -520,6 +567,36 @@
   }
 };
 
+// optimized cache friendly vector * rowmajor path for matrix with direct access flag
+// NOTE this path coul also be enabled for expressions if we add runtime align queries
+template<typename ProductType, int LhsOrder, int LhsAccess, int RhsCols>
+struct ei_cache_friendly_product_selector<ProductType,1,LhsOrder,LhsAccess,RhsCols,RowMajor,HasDirectAccess>
+{
+  typedef typename ProductType::Scalar Scalar;
+
+  template<typename DestDerived>
+  inline static void run(DestDerived& res, const ProductType& product)
+  {
+    enum {
+      EvalToRes = (ei_packet_traits<Scalar>::size==1)
+                ||(DestDerived::Flags & ActualPacketAccessBit) && (DestDerived::Flags & RowMajorBit) };
+    Scalar* __restrict__ _res;
+    if (EvalToRes)
+       _res = &res.coeffRef(0);
+    else
+    {
+      _res = (Scalar*)alloca(sizeof(Scalar)*res.size());
+      Map<Matrix<Scalar,DestDerived::RowsAtCompileTime,1> >(_res, res.size()) = res;
+    }
+    ei_cache_friendly_product(res.size(),
+      &product.rhs().const_cast_derived().coeffRef(0,0), product.rhs().stride(),
+      product.lhs().transpose(), _res);
+
+    if (!EvalToRes)
+      res = Map<Matrix<Scalar,1,DestDerived::ColsAtCompileTime> >(_res, res.size());
+  }
+};
+
 /** \internal */
 template<typename Derived>
 template<typename Lhs,typename Rhs>
diff --git a/test/product.cpp b/test/product.cpp
index ffc845c..50ec64d 100644
--- a/test/product.cpp
+++ b/test/product.cpp
@@ -39,9 +39,13 @@
 
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::FloatingPoint FloatingPoint;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> RowVectorType;
+  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> ColVectorType;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> RowSquareMatrixType;
   typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> ColSquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
+                         MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
+                         MatrixType::Flags&RowMajorBit ? 0 : RowMajorBit> OtherMajorMatrixType;
 
   int rows = m.rows();
   int cols = m.cols();
@@ -59,9 +63,11 @@
   ColSquareMatrixType
              square2 = ColSquareMatrixType::random(cols, cols),
              res2 = ColSquareMatrixType::random(cols, cols);
-  VectorType v1 = VectorType::random(rows),
-             v2 = VectorType::random(rows),
-             vzero = VectorType::zero(rows);
+  RowVectorType v1 = RowVectorType::random(rows),
+             v2 = RowVectorType::random(rows),
+             vzero = RowVectorType::zero(rows);
+  ColVectorType vc2 = ColVectorType::random(cols), vcres;
+  OtherMajorMatrixType tm1 = m1;
 
   Scalar s1 = ei_random<Scalar>();
 
@@ -89,6 +95,7 @@
 
   // test Product.h together with Identity.h
   VERIFY_IS_APPROX(v1,                      identity*v1);
+  VERIFY_IS_APPROX(v1.transpose(),          v1.transpose() * identity);
   // again, test operator() to check const-qualification
   VERIFY_IS_APPROX(MatrixType::identity(rows, cols)(r,c), static_cast<Scalar>(r==c));
 
@@ -110,6 +117,21 @@
   {
     VERIFY(areNotApprox(res,square + m2 * m1.transpose()));
   }
+  vcres = vc2;
+  vcres += (m1.transpose() * v1).lazy();
+  VERIFY_IS_APPROX(vcres, vc2 + m1.transpose() * v1);
+  tm1 = m1;
+  VERIFY_IS_APPROX(tm1.transpose() * v1, m1.transpose() * v1);
+  VERIFY_IS_APPROX(v1.transpose() * tm1, v1.transpose() * m1);
+
+  // test submatrix and matrix/vector product
+  for (int i=0; i<rows; ++i)
+    res.row(i) = m1.row(i) * m2.transpose();
+  VERIFY_IS_APPROX(res, m1 * m2.transpose());
+  // the other way round:
+  for (int i=0; i<rows; ++i)
+    res.col(i) = m1 * m2.transpose().col(i);
+  VERIFY_IS_APPROX(res, m1 * m2.transpose());
 
   res2 = square2;
   res2 += (m1.transpose() * m2).lazy();