AVX ploaddup<Packet4cf>: delegate to ploaddup<Packet4d> (2 uops vs ~5)

libeigen/eigen!2455

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 3f3046f..9c6b556 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -141,10 +141,9 @@ template <> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) { - // FIXME The following might be optimized using _mm256_movedup_pd - Packet2cf a = ploaddup<Packet2cf>(from); - Packet2cf b = ploaddup<Packet2cf>(from + 1); - return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); + // Reinterpret each std::complex<float> as a double and delegate to + // ploaddup<Packet4d>, which does vbroadcastf128 + vpermilpd in two uops. + return Packet4cf(_mm256_castpd_ps(ploaddup<Packet4d>(reinterpret_cast<const double*>(from)))); } template <>