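Speed up complex * complex matrix multiplication.

Derive the kernel's mr from NumberOfRegisters (capped at 16) instead of
fixing it at ResPacketSize, and accumulate with pmadd rather than a
separate pmul followed by padd.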
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 1b7861a..b65c246 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -718,10 +718,10 @@
     LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
     RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
     RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
-    // FIXME: should depend on NumberOfRegisters
     nr = 4,
-    mr = ResPacketSize,
+    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
 
     LhsProgress = ResPacketSize,
     RhsProgress = 1
@@ -795,8 +795,8 @@
                                                                                          DoublePacket<ResPacketType>& c,
                                                                                          TmpType& /*tmp*/,
                                                                                          const LaneIdType&) const {
-    c.first = padd(pmul(a, b.first), c.first);
-    c.second = padd(pmul(a, b.second), c.second);
+    c.first = pmadd(a, b.first, c.first);
+    c.second = pmadd(a, b.second, c.second);
   }
 
   template <typename LaneIdType>