Small speed-up in row-major sparse dense product
diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h
index 878a759..6f433fc 100644
--- a/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -65,10 +65,18 @@
static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
{
- typename Res::Scalar tmp(0);
- for(LhsInnerIterator it(lhsEval,i); it ;++it)
- tmp += it.value() * rhs.coeff(it.index(),col);
- res.coeffRef(i,col) += alpha * tmp;
+ // Two accumulators, which breaks the dependency chain on the accumulator
+ // and allows more instruction-level parallelism in the following loop
+ typename Res::Scalar tmp_a(0);
+ typename Res::Scalar tmp_b(0);
+ for(LhsInnerIterator it(lhsEval,i); it ;++it) {
+ tmp_a += it.value() * rhs.coeff(it.index(), col);
+ ++it;
+ if(it) {
+ tmp_b += it.value() * rhs.coeff(it.index(), col);
+ }
+ }
+ res.coeffRef(i, col) += alpha * (tmp_a + tmp_b);
}
};