Small speed-up in row-major sparse dense product

commit: e939c06b0e54fd7c4bfa173d01b47d2554bf7a85 [log] [tgz]
author: Erik Schultheis <erik.schultheis@aalto.fi> Wed Dec 15 18:46:25 2021 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> Wed Dec 15 18:46:25 2021 +0000
tree: d03d636d0cfddb6328410a3ea3c1ddac85b95be8
parent: 2d39da8af54982a02e62adec26ea0fb7425e5cb0 [diff]
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 878a759..6f433fc 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h

@@ -65,10 +65,22 @@
   
+  // Adds alpha * dot(inner vector i of lhs, column col of rhs) to res(i, col).
   static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
   {
-    typename Res::Scalar tmp(0);
-    for(LhsInnerIterator it(lhsEval,i); it ;++it)
-      tmp += it.value() * rhs.coeff(it.index(),col);
-    res.coeffRef(i,col) += alpha * tmp;
+    // Two accumulators, which breaks the dependency chain on the accumulator
+    // and allows more instruction-level parallelism in the following loop.
+    typename Res::Scalar tmp_a(0);
+    typename Res::Scalar tmp_b(0);
+    LhsInnerIterator it(lhsEval,i);
+    while(it) {
+      tmp_a += it.value() * rhs.coeff(it.index(), col);
+      ++it;
+      // Odd number of entries: stop here rather than advancing an iterator
+      // that is already exhausted.
+      if(!it) break;
+      tmp_b += it.value() * rhs.coeff(it.index(), col);
+      ++it;
+    }
+    res.coeffRef(i, col) += alpha * (tmp_a + tmp_b);
   }
   
 };
commit	e939c06b0e54fd7c4bfa173d01b47d2554bf7a85	[log] [tgz]
author	Erik Schultheis <erik.schultheis@aalto.fi>	Wed Dec 15 18:46:25 2021 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	Wed Dec 15 18:46:25 2021 +0000
tree	d03d636d0cfddb6328410a3ea3c1ddac85b95be8
parent	2d39da8af54982a02e62adec26ea0fb7425e5cb0 [diff]