AVX ploaddup<Packet4cf>: delegate to ploaddup<Packet4d> (2 uops vs ~5)

libeigen/eigen!2455

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 3f3046f..9c6b556 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -141,10 +141,9 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
-  // FIXME The following might be optimized using _mm256_movedup_pd
-  Packet2cf a = ploaddup<Packet2cf>(from);
-  Packet2cf b = ploaddup<Packet2cf>(from + 1);
-  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+  // Reinterpret each std::complex<float> as a double and delegate to
+  // ploaddup<Packet4d>, which does vbroadcastf128 + vpermilpd in two uops.
+  return Packet4cf(_mm256_castpd_ps(ploaddup<Packet4d>(reinterpret_cast<const double*>(from))));
 }
 
 template <>