Speed up sparse x dense dot product by accumulating into two independent sums, which breaks the dependency chain on a single accumulator and exposes more instruction-level parallelism in the inner loop.
diff --git a/Eigen/src/SparseCore/SparseDot.h b/Eigen/src/SparseCore/SparseDot.h index aa876ec..f040915 100644 --- a/Eigen/src/SparseCore/SparseDot.h +++ b/Eigen/src/SparseCore/SparseDot.h
@@ -17,7 +17,8 @@ template <typename Derived> template <typename OtherDerived> -typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const { +inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot( + const MatrixBase<OtherDerived>& other) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived) @@ -30,17 +31,23 @@ internal::evaluator<Derived> thisEval(derived()); typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0); - Scalar res(0); - while (i) { - res += numext::conj(i.value()) * other.coeff(i.index()); + // Two accumulators, which breaks the dependency chain on the accumulator + // and allows more instruction-level parallelism in the following loop. + Scalar res1(0); + Scalar res2(0); + for (; i; ++i) { + res1 += numext::conj(i.value()) * other.coeff(i.index()); ++i; + if (i) { + res2 += numext::conj(i.value()) * other.coeff(i.index()); + } } - return res; + return res1 + res2; } template <typename Derived> template <typename OtherDerived> -typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot( +inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot( const SparseMatrixBase<OtherDerived>& other) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)