fix computation of blocking sizes for small triangular matrices
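
Previously, the triangular matrix-matrix product and triangular solver kernels
called computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc) and then divided
kc by 4 to get smaller blocks along the diagonal. Because kc is initialized to
the actual depth before the call, dividing it after the fact makes the K block
of small matrices much smaller than necessary, and it can leave the other
blocking sizes inconsistent with the kc that is actually used. The fix adds a
KcFactor template parameter to computeProductBlockingSizes so the factor is
folded into the blocking computation itself; the triangular kernels now pass
KcFactor = 4, and the old two-parameter signature is kept as an inline overload
forwarding with KcFactor = 1, so existing callers are unaffected. A small
standalone sketch of the effect follows the patch below.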
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index cf48ca2..7e42eed 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -101,7 +101,7 @@
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar>
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
// Explanations:
@@ -114,7 +114,7 @@
std::ptrdiff_t l1, l2;
enum {
- kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr
+ kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
mr = ei_product_blocking_traits<LhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
@@ -127,6 +127,12 @@
n = n;
}
+template<typename LhsScalar, typename RhsScalar>
+inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+{
+ computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+}
+
#ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
#else
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
index decf515..9796096 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -117,9 +117,7 @@
Index kc = depth; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
- // it is better to use smaller blocks along the diagonal
- kc /= 4;
+ computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -245,9 +243,7 @@
Index kc = depth; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
- // it is better to use smaller blocks along the diagonal
- kc /= 4;
+ computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 3819834..4723d35 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -66,9 +66,7 @@
Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
- // it is better to use smaller blocks along the diagonal
- kc /= 4;
+ computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@@ -206,9 +204,7 @@
Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = rows; // cache block size along the N direction
- computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
- // it is better to use smaller blocks along the diagonal
- kc /= 4;
+ computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
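
A minimal standalone sketch (not Eigen code) of why folding the factor into the
computation behaves better for small matrices. The cache budgets, the
blocking-trait constants, and the way m is derived from k below are
illustrative assumptions that only mimic the shape of the real computation;
only the kdiv formula and the KcFactor parameter come from the patch above.

    // Sketch only: compare "divide kc afterwards" with "fold KcFactor in".
    #include <algorithm>
    #include <cstdio>

    static void blockingSizes(long& k, long& m, int kcFactor)
    {
      const long l1 = 32 * 1024;                          // assumed L1 budget
      const long l2 = 256 * 1024;                         // assumed L2 budget
      const long nr = 4, packetSize = 4, scalarSize = 8;  // assumed traits
      const long kdiv = kcFactor * 2 * nr * packetSize * scalarSize;
      k = std::min(k, l1 / kdiv);                         // K block bounded by L1
      m = std::min(m, l2 / (4 * scalarSize * k));         // M block derived from k
    }

    int main()
    {
      long kOld = 64, mOld = 64, kNew = 64, mNew = 64;    // a small problem (depth 64)

      blockingSizes(kOld, mOld, /*kcFactor=*/1);
      kOld /= 4;                                          // old scheme: shrink kc afterwards

      blockingSizes(kNew, mNew, /*kcFactor=*/4);          // new scheme: factor folded in

      std::printf("old: kc=%ld mc=%ld\nnew: kc=%ld mc=%ld\n", kOld, mOld, kNew, mNew);
      return 0;
    }

With these assumed numbers the old scheme ends up with kc = 16 while the new
scheme keeps kc = 32, because KcFactor only tightens the cache-derived bound
instead of further shrinking a kc that is already limited by the matrix depth.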