Add support for complex numbers in the generic clang backend

libeigen/eigen!2078

Co-authored-by: Rasmus Munk Larsen <rmlarsen@google.com>
diff --git a/Eigen/Core b/Eigen/Core
index 59f12eb..5f46dde 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -208,6 +208,7 @@
 #if defined(EIGEN_VECTORIZE_GENERIC) && !defined(EIGEN_DONT_VECTORIZE)
 #include "src/Core/arch/clang/PacketMath.h"
 #include "src/Core/arch/clang/TypeCasting.h"
+#include "src/Core/arch/clang/Complex.h"
 #include "src/Core/arch/clang/Reductions.h"
 #include "src/Core/arch/clang/MathFunctions.h"
 #else
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index ba38cd1..dc3e03d 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -1010,12 +1010,26 @@
   return a;
 }
 
-/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
+/** \internal \returns \a a with real and imaginary parts flipped (for complex types only) */
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) {
   return Packet(numext::imag(a), numext::real(a));
 }
 
+/** \internal \returns \a a with real part duplicated (for complex types only) */
+// TODO(rmlarsen): Define and use in all complex backends.
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pdupreal(const Packet& a) {
+  return Packet(numext::real(a), numext::real(a));
+}
+
+/** \internal \returns \a a with imaginary part duplicated (for complex types only) */
+// TODO(rmlarsen): Define and use in all complex backends.
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pdupimag(const Packet& a) {
+  return Packet(numext::imag(a), numext::imag(a));
+}
+
 /**************************
  * Special math functions
  ***************************/
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 827386f..96d2783 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -1130,6 +1130,20 @@
   return psincos_double<false>(x);
 }
 
+template <bool ComputeSin, typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+    std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, float>::value, Packet>
+    psincos_selector(const Packet& x) {
+  return psincos_float<ComputeSin, Packet, true>(x);
+}
+
+template <bool ComputeSin, typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+    std::enable_if_t<std::is_same<typename unpacket_traits<Packet>::type, double>::value, Packet>
+    psincos_selector(const Packet& x) {
+  return psincos_double<ComputeSin, Packet, true>(x);
+}
+
 // Generic implementation of acos(x).
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
@@ -1470,14 +1484,24 @@
 }
 
 template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y) {
+  // In the following we annotate the code for the case where the inputs
+  // are a pair length-2 SIMD vectors representing a single pair of complex
+  // numbers x = a + i*b, y = c + i*d.
+  Packet x_re = pdupreal(x);                  // a, a
+  Packet x_im = pdupimag(x);                  // b, b
+  Packet tmp_re = Packet(pmul(x_re.v, y.v));  // a*c, a*d
+  Packet tmp_im = Packet(pmul(x_im.v, y.v));  // b*c, b*d
+  tmp_im = pcplxflip(pconj(tmp_im));          // -b*d, d*c
+  return padd(tmp_im, tmp_re);                // a*c - b*d, a*d + b*c
+}
+
+template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename Scalar::value_type RealScalar;
   typedef typename unpacket_traits<Packet>::as_real RealPacket;
 
-  RealPacket real_mask_rp = peven_mask(x.v);
-  Packet real_mask(real_mask_rp);
-
   // Real part
   RealPacket x_flip = pcplxflip(x).v;  // b, a
   Packet x_norm = phypot_complex(x);   // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
@@ -1493,12 +1517,12 @@
   RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
   RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
 
-  Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg));  // log(sqrt(a^2 + b^2)), atan2(b, a)
-  return xres;
+  return Packet(pselect(peven_mask(xreal), xreal, ximg));  // log(sqrt(a^2 + b^2)), atan2(b, a)
 }
 
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
+  // FIXME(rmlarsen): This does not work correctly for Packets of std::complex<double>.
   typedef typename unpacket_traits<Packet>::as_real RealPacket;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename Scalar::value_type RealScalar;
@@ -1516,7 +1540,7 @@
   // cis(y):
   RealPacket y = pand(odd_mask, a.v);
   y = por(y, pcplxflip(Packet(y)).v);
-  RealPacket cisy = psincos_float<false, RealPacket, true>(y);
+  RealPacket cisy = psincos_selector<false, RealPacket>(y);
   cisy = pcplxflip(Packet(cisy)).v;  // cos(y) + i * sin(y)
 
   const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index de30bcb..44ccb74 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -150,6 +150,10 @@
 template <typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y);
 
+/** \internal \returns x * y for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pmul_complex(const Packet& x, const Packet& y);
+
 template <typename Packet, int N>
 struct ppolevl;
 
diff --git a/Eigen/src/Core/arch/clang/Complex.h b/Eigen/src/Core/arch/clang/Complex.h
new file mode 100644
index 0000000..d6cc435
--- /dev/null
+++ b/Eigen/src/Core/arch/clang/Complex.h
@@ -0,0 +1,404 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_CLANG_H
+#define EIGEN_COMPLEX_CLANG_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename RealScalar, int N>
+struct complex_packet_wrapper {
+  using RealPacketT = detail::VectorType<RealScalar, 2 * N>;
+  EIGEN_STRONG_INLINE complex_packet_wrapper() = default;
+  EIGEN_STRONG_INLINE explicit complex_packet_wrapper(const RealPacketT& a) : v(a) {}
+  EIGEN_STRONG_INLINE constexpr std::complex<RealScalar> operator[](Index i) const {
+    return std::complex<RealScalar>(v[2 * i], v[2 * i + 1]);
+  }
+  RealPacketT v;
+};
+
+using Packet8cf = complex_packet_wrapper<float, 8>;
+using Packet4cf = complex_packet_wrapper<float, 4>;
+using Packet2cf = complex_packet_wrapper<float, 2>;
+using Packet4cd = complex_packet_wrapper<double, 4>;
+using Packet2cd = complex_packet_wrapper<double, 2>;
+
+struct generic_complex_packet_traits : default_packet_traits {
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasArg = 0,
+    HasSetLinear = 0,
+    HasConj = 1,
+    // Math functions
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+  };
+};
+
+template <>
+struct packet_traits<std::complex<float>> : generic_complex_packet_traits {
+  using type = Packet8cf;
+  using half = Packet8cf;
+  enum {
+    size = 8,
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8cf> : generic_unpacket_traits {
+  using type = std::complex<float>;
+  using half = Packet8cf;
+  using as_real = Packet16f;
+  enum {
+    size = 8,
+  };
+};
+
+template <>
+struct packet_traits<std::complex<double>> : generic_complex_packet_traits {
+  using type = Packet4cd;
+  using half = Packet4cd;
+  enum {
+    size = 4,
+    HasExp = 0,  // FIXME(rmlarsen): pexp_complex is broken for double.
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4cd> : generic_unpacket_traits {
+  using type = std::complex<double>;
+  using half = Packet4cd;
+  using as_real = Packet8d;
+  enum {
+    size = 4,
+  };
+};
+
+// ------------ Load and store ops ----------
+#define EIGEN_CLANG_COMPLEX_LOAD_STORE(PACKET_TYPE)                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type* from) {   \
+    return PACKET_TYPE(ploadu<typename unpacket_traits<PACKET_TYPE>::as_real>(&numext::real_ref(*from))); \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const unpacket_traits<PACKET_TYPE>::type* from) {    \
+    return PACKET_TYPE(pload<typename unpacket_traits<PACKET_TYPE>::as_real>(&numext::real_ref(*from)));  \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE void pstoreu<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(             \
+      typename unpacket_traits<PACKET_TYPE>::type * to, const PACKET_TYPE& from) {                        \
+    pstoreu(&numext::real_ref(*to), from.v);                                                              \
+  }                                                                                                       \
+  template <>                                                                                             \
+  EIGEN_STRONG_INLINE void pstore<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(              \
+      typename unpacket_traits<PACKET_TYPE>::type * to, const PACKET_TYPE& from) {                        \
+    pstore(&numext::real_ref(*to), from.v);                                                               \
+  }
+
+EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet8cf);
+EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet4cd);
+#undef EIGEN_CLANG_COMPLEX_LOAD_STORE
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
+  const float re = numext::real(from);
+  const float im = numext::imag(from);
+  return Packet8cf(Packet16f{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
+  const double re = numext::real(from);
+  const double im = numext::imag(from);
+  return Packet4cd(Packet8d{re, im, re, im, re, im, re, im});
+}
+
+// ----------- Unary ops ------------------
+#define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP)                        \
+  template <>                                                             \
+  EIGEN_STRONG_INLINE PACKET_TYPE OP<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return PACKET_TYPE(OP(a.v));                                          \
+  }
+
+#define EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PACKET_TYPE)                                             \
+  DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pnegate)                                                    \
+  DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, pzero)                                                      \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type pfirst<PACKET_TYPE>(const PACKET_TYPE& a) { \
+    return a[0];                                                                                     \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pexp<PACKET_TYPE>(const PACKET_TYPE& a) {                          \
+    return pexp_complex(a);                                                                          \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE plog<PACKET_TYPE>(const PACKET_TYPE& a) {                          \
+    return plog_complex(a);                                                                          \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE psqrt<PACKET_TYPE>(const PACKET_TYPE& a) {                         \
+    return psqrt_complex(a);                                                                         \
+  }
+
+EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet8cf);
+EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet4cd);
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pconj<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pconj<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pconj<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pconj<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
+}
+
+#undef DELEGATE_UNARY_TO_REAL_OP
+#undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS
+
+// Flip real and imaginary parts, i.e.  {re(a), im(a)} -> {im(a), re(a)}.
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
+}
+
+// Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}.
+template <>
+EIGEN_STRONG_INLINE Packet8cf pdupreal<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdupreal<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pdupreal<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdupreal<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
+}
+
+// Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}.
+template <>
+EIGEN_STRONG_INLINE Packet8cf pdupimag<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdupimag<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pdupimag<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdupimag<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                             std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
+                             std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
+                             std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+  return Packet4cd(Packet8d{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                            std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                             std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
+                             std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
+                             std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploadquad<Packet4cd>(const std::complex<double>* from) {
+  return pset1<Packet4cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf preverse<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(reinterpret_cast<Packet16f>(preverse(reinterpret_cast<Packet8d>(a.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd preverse<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
+}
+
+// ----------- Binary ops ------------------
+#define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP)                                             \
+  template <>                                                                                   \
+  EIGEN_STRONG_INLINE PACKET_TYPE OP<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return PACKET_TYPE(OP(a.v, b.v));                                                           \
+  }
+
+#define EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PACKET_TYPE)                                            \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, psub)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pand)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, por)                                                       \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pxor)                                                      \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, pandnot)                                                   \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pdiv<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
+    return pdiv_complex(a, b);                                                                       \
+  }                                                                                                  \
+  template <>                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_eq<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    const PACKET_TYPE t = PACKET_TYPE(pcmp_eq(a.v, b.v));                                            \
+    return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v));                                          \
+  }
+
+EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet8cf);
+EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd);
+
+// Binary ops that are needed on sub-packets for predux and predux_mul.
+#define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE)                                 \
+  DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, padd)                                                   \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmul<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return pmul_complex(a, b);                                                                    \
+  }
+
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet8cf);
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf);
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf);
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cd);
+EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd);
+
+#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE)                                                               \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE void pscatter(unpacket_traits<PACKET_TYPE>::type* to, const PACKET_TYPE& from, Index stride) { \
+    constexpr int size = unpacket_traits<PACKET_TYPE>::size;                                                         \
+    for (int i = 0; i < size; ++i) {                                                                                 \
+      to[i * stride] = from[i];                                                                                      \
+    }                                                                                                                \
+  }                                                                                                                  \
+  template <>                                                                                                        \
+  EIGEN_STRONG_INLINE PACKET_TYPE pgather<typename unpacket_traits<PACKET_TYPE>::type, PACKET_TYPE>(                 \
+      const unpacket_traits<PACKET_TYPE>::type* from, Index stride) {                                                \
+    constexpr int size = unpacket_traits<PACKET_TYPE>::size;                                                         \
+    PACKET_TYPE result;                                                                                              \
+    for (int i = 0; i < size; ++i) {                                                                                 \
+      const unpacket_traits<PACKET_TYPE>::type from_i = from[i * stride];                                            \
+      result.v[2 * i] = numext::real(from_i);                                                                        \
+      result.v[2 * i + 1] = numext::imag(from_i);                                                                    \
+    }                                                                                                                \
+    return result;                                                                                                   \
+  }
+
+EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8cf);
+EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd);
+#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
+
+#undef DELEGATE_BINARY_TO_REAL_OP
+#undef EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS
+#undef EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS
+
+// ------------ ternary ops -------------
+template <>
+EIGEN_STRONG_INLINE Packet8cf pselect<Packet8cf>(const Packet8cf& mask, const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(reinterpret_cast<Packet16f>(
+      pselect(reinterpret_cast<Packet8d>(mask.v), reinterpret_cast<Packet8d>(a.v), reinterpret_cast<Packet8d>(b.v))));
+}
+
+namespace detail {
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8cf>(Packet8cf& p1, Packet8cf& p2) {
+  Packet16f tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31);
+  p1.v = tmp;
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4cd>(Packet4cd& p1, Packet4cd& p2) {
+  Packet8d tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
+  p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);
+  p1.v = tmp;
+}
+}  // namespace detail
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_CLANG_H
diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h
index 55ddc1c..e142264 100644
--- a/Eigen/src/Core/arch/clang/PacketMath.h
+++ b/Eigen/src/Core/arch/clang/PacketMath.h
@@ -245,28 +245,30 @@
 // --- Intrinsic-like specializations ---
 
 // --- Load/Store operations ---
-#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE, SCALAR_TYPE)                                    \
-  template <>                                                                                             \
-  EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const SCALAR_TYPE* from) {                          \
-    return detail::load_vector_unaligned<PACKET_TYPE>(from);                                              \
-  }                                                                                                       \
-  template <>                                                                                             \
-  EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const SCALAR_TYPE* from) {                           \
-    return detail::load_vector_aligned<PACKET_TYPE>(from);                                                \
-  }                                                                                                       \
-  template <>                                                                                             \
-  EIGEN_STRONG_INLINE void pstoreu<SCALAR_TYPE, PACKET_TYPE>(SCALAR_TYPE * to, const PACKET_TYPE& from) { \
-    detail::store_vector_unaligned<PACKET_TYPE>(to, from);                                                \
-  }                                                                                                       \
-  template <>                                                                                             \
-  EIGEN_STRONG_INLINE void pstore<SCALAR_TYPE, PACKET_TYPE>(SCALAR_TYPE * to, const PACKET_TYPE& from) {  \
-    detail::store_vector_aligned<PACKET_TYPE>(to, from);                                                  \
+#define EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PACKET_TYPE)                                                         \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE ploadu<PACKET_TYPE>(const detail::scalar_type_of_vector_t<PACKET_TYPE>* from) { \
+    return detail::load_vector_unaligned<PACKET_TYPE>(from);                                                      \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pload<PACKET_TYPE>(const detail::scalar_type_of_vector_t<PACKET_TYPE>* from) {  \
+    return detail::load_vector_aligned<PACKET_TYPE>(from);                                                        \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE void pstoreu<detail::scalar_type_of_vector_t<PACKET_TYPE>, PACKET_TYPE>(                    \
+      detail::scalar_type_of_vector_t<PACKET_TYPE> * to, const PACKET_TYPE& from) {                               \
+    detail::store_vector_unaligned<PACKET_TYPE>(to, from);                                                        \
+  }                                                                                                               \
+  template <>                                                                                                     \
+  EIGEN_STRONG_INLINE void pstore<detail::scalar_type_of_vector_t<PACKET_TYPE>, PACKET_TYPE>(                     \
+      detail::scalar_type_of_vector_t<PACKET_TYPE> * to, const PACKET_TYPE& from) {                               \
+    detail::store_vector_aligned<PACKET_TYPE>(to, from);                                                          \
   }
 
-EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f, float)
-EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d, double)
-EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i, int32_t)
-EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l, int64_t)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i)
+EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l)
 #undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET
 
 // --- Broadcast operation ---
@@ -330,6 +332,10 @@
 // Bitwise ops for integer packets
 #define EIGEN_CLANG_PACKET_BITWISE_INT(PACKET_TYPE)                                                  \
   template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE pzero<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
+    return PACKET_TYPE(0);                                                                           \
+  }                                                                                                  \
+  template <>                                                                                        \
   constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
     return PACKET_TYPE(0) == PACKET_TYPE(0);                                                         \
   }                                                                                                  \
@@ -370,8 +376,9 @@
 // Bitwise ops for floating point packets
 #define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT)                    \
   template <>                                                                                        \
-  EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& a) {                         \
-    return CAST_FROM_INT(CAST_TO_INT(a) == CAST_TO_INT(a));                                          \
+  EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /* unused */) {              \
+    using Scalar = detail::scalar_type_of_vector_t<PACKET_TYPE>;                                     \
+    return CAST_FROM_INT(PACKET_TYPE(Scalar(0)) == PACKET_TYPE(Scalar(0)));                          \
   }                                                                                                  \
   template <>                                                                                        \
   EIGEN_STRONG_INLINE PACKET_TYPE pand<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {    \
@@ -537,6 +544,7 @@
 EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d)
 EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i)
 EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
+
 #undef EIGEN_CLANG_PACKET_SCATTER_GATHER
 
 // ---- Various operations that depend on __builtin_shufflevector.
@@ -653,6 +661,21 @@
   return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /* unused */) {
+  float kTrue = numext::bit_cast<float>(int32_t(-1));
+  float kFalse = 0.0f;
+  return Packet16f{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse,
+                   kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /* unused */) {
+  double kTrue = numext::bit_cast<double>(int64_t(-1l));
+  double kFalse = 0.0;
+  return Packet8d{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
+}
+
 // Helpers for ptranspose.
 namespace detail {
 
diff --git a/Eigen/src/Core/arch/clang/Reductions.h b/Eigen/src/Core/arch/clang/Reductions.h
index d9cc558..1a6387a 100644
--- a/Eigen/src/Core/arch/clang/Reductions.h
+++ b/Eigen/src/Core/arch/clang/Reductions.h
@@ -57,54 +57,100 @@
 #if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
 namespace detail {
 template <typename VectorT>
-EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceAdd16(const VectorT& a) {
-  auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) +
-            __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15);
-  auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) + __builtin_shufflevector(t1, t1, 1, 3, 5, 7);
-  auto t3 = __builtin_shufflevector(t2, t2, 0, 2) + __builtin_shufflevector(t2, t2, 1, 3);
-  return t3[0] + t3[1];
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) +
+                  __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) + __builtin_shufflevector(t1, t1, 4, 5, 6, 7);
+  const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) + __builtin_shufflevector(t2, t2, 2, 3);
+  return {t3[0], t3[1]};
 }
 
 template <typename VectorT>
-EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceAdd8(const VectorT& a) {
-  auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) + __builtin_shufflevector(a, a, 1, 3, 5, 7);
-  auto t2 = __builtin_shufflevector(t1, t1, 0, 2) + __builtin_shufflevector(t1, t1, 1, 3);
-  return t2[0] + t2[1];
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
+  return {t2[0], t2[1]};
 }
 
 template <typename VectorT>
-EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceMul16(const VectorT& a) {
-  auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6, 8, 10, 12, 14) *
-            __builtin_shufflevector(a, a, 1, 3, 5, 7, 9, 11, 13, 15);
-  auto t2 = __builtin_shufflevector(t1, t1, 0, 2, 4, 6) * __builtin_shufflevector(t1, t1, 1, 3, 5, 7);
-  auto t3 = __builtin_shufflevector(t2, t2, 0, 2) * __builtin_shufflevector(t2, t2, 1, 3);
-  return t3[0] * t3[1];
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul16(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7) *
+                  __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1, 2, 3) * __builtin_shufflevector(t1, t1, 4, 5, 6, 7);
+  const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3);
+  return {t3[0], t3[1]};
 }
 
 template <typename VectorT>
-EIGEN_STRONG_INLINE scalar_type_of_vector_t<VectorT> ReduceMul8(const VectorT& a) {
-  auto t1 = __builtin_shufflevector(a, a, 0, 2, 4, 6) * __builtin_shufflevector(a, a, 1, 3, 5, 7);
-  auto t2 = __builtin_shufflevector(t1, t1, 0, 2) * __builtin_shufflevector(t1, t1, 1, 3);
-  return t2[0] * t2[1];
+EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
+    const VectorT& a) {
+  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
+  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
+  return {t2[0], t2[1]};
 }
 }  // namespace detail
 
 template <>
 EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
-  return detail::ReduceAdd16(a);
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceAdd16(a);
+  return even + odd;
 }
 template <>
 EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
-  return detail::ReduceAdd8(a);
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceAdd8(a);
+  return even + odd;
 }
 template <>
 EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-  return detail::ReduceMul16(a);
+  float even, odd;
+  std::tie(even, odd) = detail::ReduceMul16(a);
+  return even * odd;
 }
 template <>
 EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
-  return detail::ReduceMul8(a);
+  double even, odd;
+  std::tie(even, odd) = detail::ReduceMul8(a);
+  return even * odd;
 }
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a) {
+  float re, im;
+  std::tie(re, im) = detail::ReduceAdd16(a.v);
+  return std::complex<float>(re, im);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a) {
+  double re, im;
+  std::tie(re, im) = detail::ReduceAdd8(a.v);
+  return std::complex<double>(re, im);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a) {
+  const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7));
+  const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15));
+  const Packet4cf prod4 = pmul<Packet4cf>(lower4, upper4);
+  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 0, 1, 2, 3));
+  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 4, 5, 6, 7));
+  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
+  return prod2[0] * prod2[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a) {
+  const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));
+  const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));
+  const Packet2cd prod2 = pmul<Packet2cd>(lower2, upper2);
+  return prod2[0] * prod2[1];
+}
+
 #endif
 
 }  // end namespace internal