Apply clang-format
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 238edc8..a5e6499 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -18,16 +18,15 @@
 namespace internal {
 
 //---------- float ----------
-struct Packet4cf
-{
+struct Packet4cf {
   EIGEN_STRONG_INLINE Packet4cf() {}
   EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
-  __m256  v;
+  __m256 v;
 };
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet4cf type;
   typedef Packet2cf half;
   enum {
@@ -35,50 +34,58 @@
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet4cf> {
+template <>
+struct unpacket_traits<Packet4cf> {
   typedef std::complex<float> type;
   typedef Packet2cf half;
   typedef Packet8f as_real;
   enum {
-    size=4,
-    alignment=Aligned32,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) {
   return Packet4cf(pnegate(a.v));
 }
-template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
-{
-  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet4cf(_mm256_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+                                                            0x80000000, 0x00000000, 0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v, mask));
 }
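
pconj relies only on the IEEE-754 sign-bit layout: XOR-ing 0x80000000 into each odd (imaginary) float lane negates that lane and leaves the even (real) lanes untouched. A standalone scalar sketch of the identity, outside the patch:

#include <complex>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  std::complex<float> z(3.0f, -4.0f);
  float im = z.imag();
  uint32_t bits;
  std::memcpy(&bits, &im, sizeof bits);
  bits ^= 0x80000000u;  // flip the sign bit, as the odd lanes of the mask do
  std::memcpy(&im, &bits, sizeof bits);
  std::printf("imag after flip: %g, std::conj gives %g\n", im, std::conj(z).imag());
  return 0;
}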
 
-template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
   __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
-  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
   __m256 result = _mm256_addsub_ps(tmp1, tmp2);
   return Packet4cf(result);
 }
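
The product above is the textbook addsub formulation: per complex lane a = (ar, ai), tmp1 = (ar*br, ar*bi) and tmp2 = (ai*bi, ai*br), and _mm256_addsub_ps subtracts in even lanes and adds in odd lanes, giving (ar*br - ai*bi, ar*bi + ai*br). A standalone sketch, outside the patch, that checks one register against std::complex (assumes an AVX host, e.g. g++ -mavx):

#include <immintrin.h>
#include <complex>
#include <cstdio>

int main() {
  const std::complex<float> a[4] = {{1, 2}, {3, 4}, {-5, 6}, {7, -8}};
  const std::complex<float> b[4] = {{2, 1}, {0, 5}, {1, -1}, {-3, 2}};
  __m256 av = _mm256_loadu_ps(reinterpret_cast<const float*>(a));
  __m256 bv = _mm256_loadu_ps(reinterpret_cast<const float*>(b));
  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(av), bv);  // (ar*br, ar*bi, ...)
  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(av),
                              _mm256_permute_ps(bv, _MM_SHUFFLE(2, 3, 0, 1)));  // (ai*bi, ai*br, ...)
  __m256 prod = _mm256_addsub_ps(tmp1, tmp2);  // even lanes: sub, odd lanes: add
  float out[8];
  _mm256_storeu_ps(out, prod);
  for (int i = 0; i < 4; ++i) {
    std::complex<float> ref = a[i] * b[i];
    std::printf("lane %d: simd=(%g,%g) ref=(%g,%g)\n", i, out[2 * i], out[2 * i + 1], ref.real(), ref.imag());
  }
  return 0;
}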
@@ -89,112 +96,135 @@
   return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(ptrue(Packet8f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_andnot_ps(b.v, a.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from)));
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) {
   const float re = std::real(from);
   const float im = std::imag(from);
   return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
   // FIXME The following might be optimized using _mm256_movedup_pd
   Packet2cf a = ploaddup<Packet2cf>(from);
-  Packet2cf b = ploaddup<Packet2cf>(from+1);
-  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+  Packet2cf b = ploaddup<Packet2cf>(from + 1);
+  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
-{
-  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
-                                 std::imag(from[2*stride]), std::real(from[2*stride]),
-                                 std::imag(from[1*stride]), std::real(from[1*stride]),
-                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet4cf(_mm256_set_ps(std::imag(from[3 * stride]), std::real(from[3 * stride]), std::imag(from[2 * stride]),
+                                 std::real(from[2 * stride]), std::imag(from[1 * stride]), std::real(from[1 * stride]),
+                                 std::imag(from[0 * stride]), std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from,
+                                                                       Index stride) {
   __m128 low = _mm256_extractf128_ps(from.v, 0);
-  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
-  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+  to[stride * 0] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride * 1] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
 
   __m128 high = _mm256_extractf128_ps(from.v, 1);
-  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
-  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
-
+  to[stride * 2] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride * 3] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
 }
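
pgather/pscatter above are a strided copy of four complex values into and out of the packet; a scalar sketch of that contract, outside the patch (gather_ref/scatter_ref are illustrative names, not Eigen API):

#include <complex>
#include <cstddef>

using Index = std::ptrdiff_t;  // stands in for Eigen::Index here

void gather_ref(std::complex<float> dst[4], const std::complex<float>* from, Index stride) {
  for (int i = 0; i < 4; ++i) dst[i] = from[i * stride];  // lanes 0..3
}

void scatter_ref(std::complex<float>* to, const std::complex<float> src[4], Index stride) {
  for (int i = 0; i < 4; ++i) to[i * stride] = src[i];
}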
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) {
   return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
-  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+template <>
+EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low = _mm256_extractf128_ps(a.v, 0);
   __m128 high = _mm256_extractf128_ps(a.v, 1);
-  __m128d lowd  = _mm_castps_pd(low);
+  __m128d lowd = _mm_castps_pd(low);
   __m128d highd = _mm_castps_pd(high);
-  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
-  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  low = _mm_castpd_ps(_mm_shuffle_pd(lowd, lowd, 0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd, highd, 0x1));
   __m256 result = _mm256_setzero_ps();
   result = _mm256_insertf128_ps(result, low, 1);
   result = _mm256_insertf128_ps(result, high, 0);
   return Packet4cf(result);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
-{
-  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
-                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) {
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
-{
-  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
-                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) {
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf, Packet8f)
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
-
-template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
-{
-  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) {
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
 }
 
 //---------- double ----------
-struct Packet2cd
-{
+struct Packet2cd {
   EIGEN_STRONG_INLINE Packet2cd() {}
   EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
-  __m256d  v;
+  __m256d v;
 };
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet2cd type;
   typedef Packet1cd half;
   enum {
@@ -202,50 +232,60 @@
     AlignedOnScalar = 0,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet2cd> {
+template <>
+struct unpacket_traits<Packet2cd> {
   typedef std::complex<double> type;
   typedef Packet1cd half;
   typedef Packet4d as_real;
   enum {
-    size=2,
-    alignment=Aligned32,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    size = 2,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
-{
-  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
-  return Packet2cd(_mm256_xor_pd(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) {
+  return Packet2cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
+  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  return Packet2cd(_mm256_xor_pd(a.v, mask));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
-  __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  __m256d tmp1 = _mm256_shuffle_pd(a.v, a.v, 0x0);
   __m256d even = _mm256_mul_pd(tmp1, b.v);
-  __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
-  __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
-  __m256d odd  = _mm256_mul_pd(tmp2, tmp3);
+  __m256d tmp2 = _mm256_shuffle_pd(a.v, a.v, 0xF);
+  __m256d tmp3 = _mm256_shuffle_pd(b.v, b.v, 0x5);
+  __m256d odd = _mm256_mul_pd(tmp2, tmp3);
   return Packet2cd(_mm256_addsub_pd(even, odd));
 }
 
@@ -255,82 +295,110 @@
   return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(ptrue(Packet4d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_andnot_pd(b.v, a.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) {
   // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
-//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
-    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+  //   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+  return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
-{
-  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
-				 std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) {
+  return pset1<Packet2cd>(*from);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from,
+                                                                            Index stride) {
+  return Packet2cd(_mm256_set_pd(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+                                 std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+                                                                        Index stride) {
   __m128d low = _mm256_extractf128_pd(from.v, 0);
-  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  to[stride * 0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
   __m128d high = _mm256_extractf128_pd(from.v, 1);
-  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+  to[stride * 1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) {
   __m128d low = _mm256_extractf128_pd(a.v, 0);
   EIGEN_ALIGN16 double res[2];
   _mm_store_pd(res, low);
-  return std::complex<double>(res[0],res[1]);
+  return std::complex<double>(res[0], res[1]);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
   __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
   return Packet2cd(result);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
-{
-  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
-                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) {
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
-{
-  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
-                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) {
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd, Packet4d)
 
-template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) {
   return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cf, 4>& kernel) {
   __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
   __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
   __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
@@ -347,23 +415,24 @@
   kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cd,2>& kernel) {
-  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
-  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
- kernel.packet[0].v = tmp;
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0 + (2 << 4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1 + (3 << 4));
+  kernel.packet[0].v = tmp;
 }
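
The immediates 0 + (2 << 4) and 1 + (3 << 4) are _mm256_permute2f128_pd selectors: the low nibble chooses the result's low 128-bit lane and the high nibble its high lane, with values 0/1 addressing the first source's lanes and 2/3 the second's. The pair therefore interleaves the two low lanes and the two high lanes, a 2x2 transpose of complex<double> entries. A standalone sketch of the selector decoding, outside the patch:

#include <cstdio>

int main() {
  // Sources written as [hi | lo]: r0 = [A1 | A0], r1 = [B1 | B0].
  const char* lanes[4] = {"A0", "A1", "B0", "B1"};
  for (int ctl : {0 + (2 << 4), 1 + (3 << 4)})
    std::printf("ctl 0x%02x -> [%s | %s]\n", ctl, lanes[(ctl >> 4) & 3], lanes[ctl & 3]);
  return 0;
}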
 
-template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
   return psqrt_complex<Packet2cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
   return psqrt_complex<Packet4cf>(a);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_AVX_H
+#endif  // EIGEN_COMPLEX_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 6e83cfc..b125d59 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -28,20 +28,19 @@
 // iteration for square root. In particular, Skylake and Zen2 processors
 // have approximately doubled throughput of the _mm_sqrt_ps instruction
 // compared to their predecessors.
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f psqrt<Packet8f>(const Packet8f& _x) {
   return _mm256_sqrt_ps(_x);
 }
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d psqrt<Packet4d>(const Packet4d& _x) {
   return _mm256_sqrt_pd(_x);
 }
 
-
 // Even on Skylake, using Newton iteration is a win for reciprocal square root.
 #if EIGEN_FAST_MATH
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet8f prsqrt<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f prsqrt<Packet8f>(const Packet8f& a) {
   // _mm256_rsqrt_ps returns -inf for negative denormals.
   // _mm512_rsqrt**_ps returns -NaN for negative denormals.  We may want
   // consistency here.
@@ -51,7 +50,8 @@
   return generic_rsqrt_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rsqrt_ps(a));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
   return generic_reciprocal_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rcp_ps(a));
 }
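
Both kernels refine the roughly 12-bit hardware estimates (vrsqrtps, vrcpps) with a single Newton-Raphson step, y' = y * (1.5 - 0.5 * a * y * y) for 1/sqrt(a) and y' = y * (2 - a * y) for 1/a, which roughly doubles the number of correct bits. A standalone scalar sketch, outside the patch (the exact arrangement inside generic_rsqrt_newton_step / generic_reciprocal_newton_step may differ):

#include <cmath>
#include <cstdio>

int main() {
  float a = 2.0f;
  float y_rsqrt = 0.70f;  // pretend ~12-bit estimate of 1/sqrt(2)
  float y_rcp = 0.49f;    // pretend ~12-bit estimate of 1/2
  float r1 = y_rsqrt * (1.5f - 0.5f * a * y_rsqrt * y_rsqrt);  // Newton step for 1/sqrt(a)
  float r2 = y_rcp * (2.0f - a * y_rcp);                       // Newton step for 1/a
  std::printf("rsqrt: %.7f (exact %.7f)\n", r1, 1.0 / std::sqrt(2.0));
  std::printf("rcp:   %.7f (exact %.7f)\n", r2, 0.5);
  return 0;
}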
 
@@ -106,7 +106,6 @@
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
 
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6f37ba0..d752f06 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -31,7 +31,7 @@
 #endif
 #endif
 
-typedef __m256  Packet8f;
+typedef __m256 Packet8f;
 typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
 typedef __m256d Packet4d;
 #ifndef EIGEN_VECTORIZE_AVX512FP16
@@ -46,31 +46,58 @@
 typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
 #endif
 
-template<> struct is_arithmetic<__m256>  { enum { value = true }; };
-template<> struct is_arithmetic<__m256i> { enum { value = true }; };
-template<> struct is_arithmetic<__m256d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet8i> { enum { value = true }; };
+template <>
+struct is_arithmetic<__m256> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8i> {
+  enum { value = true };
+};
 // Note that `Packet8ui` uses the underlying type `__m256i`, which is
 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
 // operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet8ui> { enum { value = false }; };
+template <>
+struct is_arithmetic<Packet8ui> {
+  enum { value = false };
+};
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet8h> {
+  enum { value = true };
+};
 #endif
-template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet8bf> {
+  enum { value = true };
+};
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> struct is_arithmetic<Packet4l> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4l> {
+  enum { value = true };
+};
 // Note that `Packet4ul` uses the underlying type `__m256i`, which is
 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
 // operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet4ul> { enum { value = false }; };
+template <>
+struct is_arithmetic<Packet4ul> {
+  enum { value = false };
+};
 #endif
 
 // Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
 // to leverage AVX512 instructions.
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<float>  : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet8f type;
   typedef Packet4f half;
   enum {
@@ -78,7 +105,7 @@
     AlignedOnScalar = 1,
     size = 8,
 
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
@@ -104,19 +131,19 @@
     HasRint = 1
   };
 };
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet4d type;
   typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
 
-    HasCmp  = 1,
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasATan = 1,
@@ -138,35 +165,35 @@
     AlignedOnScalar = 1,
     size = 8,
 
-    HasCmp    = 1,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasSin    = EIGEN_FAST_MATH,
-    HasCos    = EIGEN_FAST_MATH,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
     HasNegate = 1,
-    HasAbs    = 1,
-    HasAbs2   = 0,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 0,
-    HasLog    = 1,
-    HasLog1p  = 1,
-    HasExpm1  = 1,
-    HasExp    = 1,
-    HasSqrt   = 1,
-    HasRsqrt  = 1,
-    HasTanh   = EIGEN_FAST_MATH,
-    HasErf    = EIGEN_FAST_MATH,
-    HasBlend  = 0,
-    HasRound  = 1,
-    HasFloor  = 1,
-    HasCeil   = 1,
-    HasRint   = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
     HasBessel = 1,
-    HasNdtri  = 1
+    HasNdtri = 1
   };
 };
 
@@ -189,15 +216,15 @@
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasNegate = 1,
-    HasAbs    = 1,
-    HasAbs2   = 0,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 0,
     HasLog = 1,
-    HasLog1p  = 1,
-    HasExpm1  = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
@@ -209,24 +236,18 @@
     HasCeil = 1,
     HasRint = 1,
     HasBessel = 1,
-    HasNdtri  = 1
+    HasNdtri = 1
   };
 };
 
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet8i type;
   typedef Packet4i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    HasCmp = 1,
-    HasDiv = 1,
-    size=8
-  };
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 8 };
 };
-template<> struct packet_traits<uint32_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
   typedef Packet8ui type;
   typedef Packet4ui half;
   enum {
@@ -246,21 +267,16 @@
 };
 
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> struct packet_traits<int64_t> : default_packet_traits
-{
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
   typedef Packet4l type;
   // There is no half-size packet for current Packet4l.
   // TODO: support as SSE path.
   typedef Packet4l half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    HasCmp = 1,
-    size=4
-  };
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
 };
-template<> struct packet_traits<uint64_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
   typedef Packet4ul type;
   // There is no half-size packet for current Packet4ul.
   // TODO: support as SSE path.
@@ -285,51 +301,106 @@
 
 #endif
 
-template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
-template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+template <>
+struct scalar_div_cost<float, true> {
+  enum { value = 14 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+  enum { value = 16 };
+};
 
-template<> struct unpacket_traits<Packet8f> {
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet8i  integer_packet;
-  typedef uint8_t   mask_t;
-  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true
+template <>
+struct unpacket_traits<Packet8f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet8i integer_packet;
+  typedef uint8_t mask_t;
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true
 #ifdef EIGEN_VECTORIZE_AVX512
-    , masked_fpops_available=true
+    ,
+    masked_fpops_available = true
 #endif
   };
 };
-template<> struct unpacket_traits<Packet4d> {
+template <>
+struct unpacket_traits<Packet4d> {
   typedef double type;
   typedef Packet2d half;
-  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet8i> {
-  typedef int    type;
+template <>
+struct unpacket_traits<Packet8i> {
+  typedef int type;
   typedef Packet4i half;
-  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet8ui> {
+template <>
+struct unpacket_traits<Packet8ui> {
   typedef uint32_t type;
   typedef Packet4ui half;
-  enum {size = 8, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> struct unpacket_traits<Packet4l> {
-  typedef int64_t    type;
+template <>
+struct unpacket_traits<Packet4l> {
+  typedef int64_t type;
   typedef Packet4l half;
-  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet4ul> {
+template <>
+struct unpacket_traits<Packet4ul> {
   typedef uint64_t type;
   typedef Packet4ul half;
-  enum {size = 4, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 #endif
-template<> struct unpacket_traits<Packet8bf> {
+template <>
+struct unpacket_traits<Packet8bf> {
   typedef bfloat16 type;
   typedef Packet8bf half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 // Helper function for bit packing snippet of low precision comparison.
@@ -380,7 +451,7 @@
 EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
   return _mm256_add_epi64(a, b);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
   return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
 }
@@ -468,31 +539,33 @@
 }
 #ifdef EIGEN_VECTORIZE_AVX512FP16
 template <int N>
-EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) { return _mm256_srai_epi64(a, N); }
+EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) {
+  return _mm256_srai_epi64(a, N);
+}
 #else
 template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
   return a;
 }
 template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
   __m256i hi_word = _mm256_srai_epi32(a, N);
   __m256i lo_word = _mm256_srli_epi64(a, N);
   return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
 }
 template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
   __m256i hi_word = _mm256_srai_epi32(a, 31);
   __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
   return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
 }
 template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
   return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
 }
 template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
-  return parithmetic_shift_right<int(N&63)>(a);
+EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return parithmetic_shift_right<int(N & 63)>(a);
 }
 #endif
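
The 0 < N < 32 branch works because AVX2 has no 64-bit arithmetic shift: a logical 64-bit shift already yields the correct low 32 bits, an arithmetic 32-bit shift of the high dword yields the correct sign-extended high 32 bits, and the blend mask 0b01010101 splices the two. A standalone scalar sketch of the splice, outside the patch (assumes the usual arithmetic >> on signed integers):

#include <cstdint>
#include <cstdio>

int64_t sar64_emulated(int64_t a, int N) {  // valid for 0 < N < 32
  uint64_t u = static_cast<uint64_t>(a);
  uint32_t lo = static_cast<uint32_t>(u >> N);      // the _mm256_srli_epi64 half
  int32_t hi = static_cast<int32_t>(u >> 32) >> N;  // the _mm256_srai_epi32 half
  return static_cast<int64_t>((static_cast<uint64_t>(static_cast<uint32_t>(hi)) << 32) | lo);
}

int main() {
  for (int64_t v : {int64_t(-1), int64_t(-123456789012345LL), int64_t(98765)})
    std::printf("%lld >> 7: emulated=%lld exact=%lld\n", static_cast<long long>(v),
                static_cast<long long>(sar64_emulated(v, 7)), static_cast<long long>(v >> 7));
  return 0;
}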
 template <>
@@ -523,7 +596,7 @@
   const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
   return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
 }
@@ -577,7 +650,7 @@
   Packet4ul pa = pset1<Packet4ul>(a);
   pstore(to, pa);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
   return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
 }
@@ -667,51 +740,102 @@
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) { return _mm256_set1_epi32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+  return _mm256_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) {
+  return _mm256_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) {
+  return _mm256_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) {
+  return _mm256_set1_epi32(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
-template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) {
+  return _mm256_castsi256_ps(pset1<Packet8i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) {
+  return _mm256_castsi256_pd(_mm256_set1_epi64x(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
-template<> EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) { return _mm256_setzero_si256(); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) {
+  return _mm256_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) {
+  return _mm256_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) {
+  return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) {
+  return _mm256_setzero_si256();
+}
 
+template <>
+EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) {
+  return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) {
+  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) {
+  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) {
+  return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
-template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) {
+  return _mm256_broadcast_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) {
+  return _mm256_broadcast_sd(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
-
-template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_add_ps(a, b);
+}
 #ifdef EIGEN_VECTORIZE_AVX512
 template <>
 EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
   __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
-  return _mm512_castps512_ps256(_mm512_maskz_add_ps(
-                                    mask,
-                                    _mm512_castps256_ps512(a),
-                                    _mm512_castps256_ps512(b)));
+  return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask, _mm512_castps256_ps512(a), _mm512_castps256_ps512(b)));
 }
 #endif
-template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_add_epi32(a,b);
+  return _mm256_add_epi32(a, b);
 #else
   __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
   __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_add_epi32(a, b);
 #else
@@ -721,24 +845,43 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) { return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7,6,5,4,3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) { return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) {
+  return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) {
+  return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) {
+  return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) {
+  return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_sub_epi32(a,b);
+  return _mm256_sub_epi32(a, b);
 #else
   __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
   __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_sub_epi32(a, b);
 #else
@@ -748,38 +891,54 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
   const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
   return _mm256_xor_ps(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
   const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
   return _mm256_xor_pd(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) {
   return psub(pzero(a), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_mullo_epi32(a,b);
+  return _mm256_mullo_epi32(a, b);
 #else
   const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
   const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_mullo_epi32(a, b);
 #else
@@ -789,11 +948,17 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_div_pd(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX512
   return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
 #else
@@ -845,20 +1010,48 @@
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) { return _mm256_cmp_ps(a,a,_CMP_UNORD_Q); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) {
+  return _mm256_cmp_ps(a, a, _CMP_UNORD_Q);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_xor_si256(_mm256_cmpgt_epi32(a,b), _mm256_set1_epi32(-1));
+  return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
 #else
   __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
   lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
@@ -867,25 +1060,28 @@
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_cmpgt_epi32(b,a);
+  return _mm256_cmpgt_epi32(b, a);
 #else
   __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
   __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_cmpeq_epi32(a,b);
+  return _mm256_cmpeq_epi32(a, b);
 #else
   __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
   __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_cmpeq_epi32(a, b);
 #else
@@ -895,32 +1091,35 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
   // There appears to be a bug in GCC, by which the optimizer may flip
   // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
   // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
   Packet8f res;
-  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
   return res;
 #else
   // Arguments are swapped to match NaN propagation behavior of std::min.
-  return _mm256_min_ps(b,a);
+  return _mm256_min_ps(b, a);
 #endif
 }
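+// Scalar model of the argument swap above (illustrative sketch only): vminps
+// returns its second operand whenever either input is NaN, so min_ps(b, a)
+// behaves like
+//   float scalar_pmin(float a, float b) { return b < a ? b : a; }
+// which is exactly std::min(a, b), returning `a` when a NaN is involved.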
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
   // See pmin above
   Packet4d res;
-  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
   return res;
 #else
   // Arguments are swapped to match NaN propagation behavior of std::min.
-  return _mm256_min_pd(b,a);
+  return _mm256_min_pd(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_min_epi32(a, b);
 #else
@@ -929,7 +1128,8 @@
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_min_epu32(a, b);
 #else
@@ -939,29 +1139,32 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
   // See pmin above
   Packet8f res;
-  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
   return res;
 #else
   // Arguments are swapped to match NaN propagation behavior of std::max.
-  return _mm256_max_ps(b,a);
+  return _mm256_max_ps(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
   // See pmin above
   Packet4d res;
-  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
   return res;
 #else
   // Arguments are swapped to match NaN propagation behavior of std::max.
-  return _mm256_max_pd(b,a);
+  return _mm256_max_pd(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_max_epi32(a, b);
 #else
@@ -970,7 +1173,8 @@
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_max_epu32(a, b);
 #else
@@ -981,129 +1185,174 @@
 }
 
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
   return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
 }
 #endif
 
 // Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet8f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet4d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet8f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet4d>);
 }
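+// Policy summary (per the generic helpers; e.g. with a = {NaN, 1}, b = {2, NaN}):
+//   pmin<PropagateNumbers>(a, b) -> {2, 1}     (prefer the non-NaN operand)
+//   pmin<PropagateNaN>(a, b)     -> {NaN, NaN} (any NaN input poisons the lane)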
 
-template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) {
+  return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) {
+  return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) {
+  return _mm256_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) {
+  return _mm256_ceil_pd(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) {
+  return _mm256_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) {
+  return _mm256_floor_pd(a);
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   // vpcmpeqd has lower latency than the more general vcmpps
-  return _mm256_cmpeq_epi32(a,a);
+  return _mm256_cmpeq_epi32(a, a);
 #else
   const __m256 b = _mm256_castsi256_ps(a);
-  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+  return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   // vpcmpeqd has lower latency than the more general vcmpps
   const __m256i b = _mm256_castps_si256(a);
-  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
 #else
-  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+  return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   // vpcmpeqq has lower latency than the more general vcmppd
   const __m256i b = _mm256_castpd_si256(a);
-  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
 #else
-  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+  return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
 #endif
 }
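+// Note: both paths are NaN-safe. x == x is identically true bit-wise for
+// integers, and _CMP_TRUE_UQ is the unordered "always true" predicate, so
+// every lane comes back all-ones even for NaN inputs.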
 
-template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_and_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_and_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_and_si256(a,b);
+  return _mm256_and_si256(a, b);
 #else
-  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_and_si256(a,b);
+  return _mm256_and_si256(a, b);
 #else
-  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_or_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_or_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_or_si256(a,b);
+  return _mm256_or_si256(a, b);
 #else
-  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_or_si256(a,b);
+  return _mm256_or_si256(a, b);
 #else
-  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_xor_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_xor_si256(a,b);
+  return _mm256_xor_si256(a, b);
 #else
-  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_xor_si256(a, b);
 #else
@@ -1111,54 +1360,75 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_andnot_si256(b,a);
+  return _mm256_andnot_si256(b, a);
 #else
-  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
 #ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_andnot_si256(b,a);
+  return _mm256_andnot_si256(b, a);
 #else
-  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
   return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
   return pcmp_eq(a, pmin(a, b));
 }
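+// AVX2 has no unsigned 32-bit compare, so the two predicates above rely on
+// the min/max identities (sketch):
+//   a <  b  <=>  !(a == pmax(a, b))
+//   a <= b  <=>    a == pmin(a, b)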
 
-template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) {
   const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
   const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
   return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
-template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) {
   const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
   const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
   return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
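+// Rounding model (worked example): prev0dot5 is the largest float/double
+// below 0.5, so adding it with the sign of `a` and truncating toward zero
+// implements round-half-away-from-zero:
+//   pround(2.5f)  -> trunc(2.5f + 0.49999997f)  -> 3.0f
+//   pround(-2.5f) -> trunc(-2.5f - 0.49999997f) -> -3.0f
+// whereas round-to-nearest-even (_mm256_round_ps) would give +/-2.0f.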
 
-template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
-{ return _mm256_blendv_ps(b,a,mask); }
-template<> EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b)
-{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
-template<> EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b)
-{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+  return _mm256_blendv_ps(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b) {
+  return _mm256_castps_si256(
+      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b) {
+  return _mm256_castps_si256(
+      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
-{ return _mm256_blendv_pd(b,a,mask); }
+template <>
+EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b) {
+  return _mm256_blendv_pd(b, a, mask);
+}
 
-template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_srai_epi32(a, N);
 #else
@@ -1168,7 +1438,8 @@
 #endif
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_srli_epi32(a, N);
 #else
@@ -1178,7 +1449,8 @@
 #endif
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_slli_epi32(a, N);
 #else
@@ -1188,33 +1460,62 @@
 #endif
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
   return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
 }
-template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
   return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
 }
-template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
   return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
 #ifdef EIGEN_VECTORIZE_AVX512
   __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
-  EIGEN_DEBUG_UNALIGNED_LOAD return  _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
 #else
   Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
-  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  const Packet8i bit_mask =
+      _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
   mask = por<Packet8i>(mask, bit_mask);
   mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
@@ -1222,41 +1523,44 @@
 }
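+// Mask construction above (sketch of the bit trick): lane i of bit_mask has
+// every bit set except bit i; after broadcasting umask into every byte and
+// OR-ing, lane i equals 0xffffffff iff bit i of umask was set, and pcmp_eq
+// turns that into the per-lane load mask. E.g. umask = 0b00000101 loads
+// lanes {0, 2} and zeroes the rest.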
 
 // Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
-template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
   // TODO try to find a way to avoid the need for a temporary register
   //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
-//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
-//   return _mm256_unpacklo_ps(tmp,tmp);
+  //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+  //   return _mm256_unpacklo_ps(tmp,tmp);
 
   // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
-  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  tmp = _mm256_blend_ps(
+      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
   // then we can perform a consistent permutation on the full 256-bit register to get everything in shape:
-  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
 }
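+// Lane trace of the trick above, with from = {a0, a1, a2, a3}:
+//   broadcast:                          {a0, a1, a2, a3, a0, a1, a2, a3}
+//   blend(15) with permuted low half:   {a0, a1, a0, a1, a0, a1, a2, a3}
+//   permute _MM_SHUFFLE(3, 3, 2, 2):    {a0, a0, a1, a1, a2, a2, a3, a3}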
 // Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) {
   Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
-  return  _mm256_permute_pd(tmp, 3<<2);
+  return _mm256_permute_pd(tmp, 3 << 2);
 }
 // Loads 4 integers from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
-template<> EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from) {
 #ifdef EIGEN_VECTORIZE_AVX2
   const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
   return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
 #else
   __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
-  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  tmp = _mm256_blend_ps(
+      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
   // then we can perform a consistent permutation on the full 256-bit register to get everything in shape:
-  return  _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)));
+  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
 #ifdef EIGEN_VECTORIZE_AVX2
   const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
   return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
@@ -1272,43 +1576,72 @@
 }
 
 // Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
   Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
-  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
 }
-template<> EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from)
-{
-  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1);
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from) {
+  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
 }
-template<> EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
   return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from, uint8_t umask) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
 #ifdef EIGEN_VECTORIZE_AVX512
   __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
 #else
   Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
-  const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
+  const Packet8i bit_mask =
+      _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
   mask = por<Packet8i>(mask, bit_mask);
   mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
 #if EIGEN_COMP_MSVC
   // MSVC sometimes seems to use a bogus mask with maskstore.
   const __m256i ifrom = _mm256_castps_si256(from);
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast<char*>(to));
-  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast<char*>(to + 4));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0),
+                                                  reinterpret_cast<char*>(to));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1),
+                                                  reinterpret_cast<char*>(to + 4));
 #else
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
 #endif
@@ -1316,111 +1649,129 @@
 }
 
 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
-// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
-template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
-{
-  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
-                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride),
+// 4);
+template <>
+EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+  return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+                       from[2 * stride], from[1 * stride], from[0 * stride]);
 }
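+// Hypothetical gather form alluded to in the NOTE above (untested sketch with
+// per-lane indices; the scalar _mm256_set_ps version was found to be faster):
+//   __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32(int(stride)),
+//                                    _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
+//   return _mm256_i32gather_ps(from, idx, 4);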
-template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
-{
-  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
+  return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride)
-{
-  return _mm256_set_epi32(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
-                          from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
+  return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+                          from[2 * stride], from[1 * stride], from[0 * stride]);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
+template <>
+EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
   return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
   __m128 low = _mm256_extractf128_ps(from, 0);
-  to[stride*0] = _mm_cvtss_f32(low);
-  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
-  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
-  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+  to[stride * 0] = _mm_cvtss_f32(low);
+  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
 
   __m128 high = _mm256_extractf128_ps(from, 1);
-  to[stride*4] = _mm_cvtss_f32(high);
-  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
-  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
-  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+  to[stride * 4] = _mm_cvtss_f32(high);
+  to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
   __m128d low = _mm256_extractf128_pd(from, 0);
-  to[stride*0] = _mm_cvtsd_f64(low);
-  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  to[stride * 0] = _mm_cvtsd_f64(low);
+  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
   __m128d high = _mm256_extractf128_pd(from, 1);
-  to[stride*2] = _mm_cvtsd_f64(high);
-  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+  to[stride * 2] = _mm_cvtsd_f64(high);
+  to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
   __m128i low = _mm256_extractf128_si256(from, 0);
-  to[stride*0] = _mm_extract_epi32(low, 0);
-  to[stride*1] = _mm_extract_epi32(low, 1);
-  to[stride*2] = _mm_extract_epi32(low, 2);
-  to[stride*3] = _mm_extract_epi32(low, 3);
+  to[stride * 0] = _mm_extract_epi32(low, 0);
+  to[stride * 1] = _mm_extract_epi32(low, 1);
+  to[stride * 2] = _mm_extract_epi32(low, 2);
+  to[stride * 3] = _mm_extract_epi32(low, 3);
 
   __m128i high = _mm256_extractf128_si256(from, 1);
-  to[stride*4] = _mm_extract_epi32(high, 0);
-  to[stride*5] = _mm_extract_epi32(high, 1);
-  to[stride*6] = _mm_extract_epi32(high, 2);
-  to[stride*7] = _mm_extract_epi32(high, 3);
+  to[stride * 4] = _mm_extract_epi32(high, 0);
+  to[stride * 5] = _mm_extract_epi32(high, 1);
+  to[stride * 6] = _mm_extract_epi32(high, 2);
+  to[stride * 7] = _mm_extract_epi32(high, 3);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
   pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) {
   Packet8f pa = pset1<Packet8f>(a);
   pstore(to, pa);
 }
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) {
   Packet4d pa = pset1<Packet4d>(a);
   pstore(to, pa);
 }
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) {
   Packet8i pa = pset1<Packet8i>(a);
   pstore(to, pa);
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
   return _mm_cvtss_f32(_mm256_castps256_ps128(a));
 }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
   return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
 }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
   return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
 }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
   return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
-{
-  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+  __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
   return _mm256_permute2f128_ps(tmp, tmp, 1);
 }
-template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
-{
-   __m256d tmp = _mm256_shuffle_pd(a,a,5);
+template <>
+EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
+  __m256d tmp = _mm256_shuffle_pd(a, a, 5);
   return _mm256_permute2f128_pd(tmp, tmp, 1);
 #if 0
   // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
@@ -1429,37 +1780,41 @@
     return _mm256_permute_pd(swap_halves,5);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
   return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
 }
-template<> EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
   return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
 }
 
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a)
-    {
+template <>
+EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
   return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
 }
-template<> EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
   return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
 }
 #endif
 
 // pabs should be ok
-template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
-{
-  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
-  return _mm256_and_ps(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+                                                              0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+  return _mm256_and_ps(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
-{
-  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
-  return _mm256_and_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
+                                                              0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+  return _mm256_and_pd(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) {
 #ifdef EIGEN_VECTORIZE_AVX2
   return _mm256_abs_epi32(a);
 #else
@@ -1468,26 +1823,47 @@
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet8h  psignbit(const Packet8h&  a) { return _mm_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet8f  psignbit(const Packet8f&  a) { return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a))); }
-template<> EIGEN_STRONG_INLINE Packet8ui  psignbit(const Packet8ui& a)  { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
+  return _mm_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
+  return _mm_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
+  return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
+  return pzero(a);
+}
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4d  psignbit(const Packet4d& a)  { return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a))); }
-template<> EIGEN_STRONG_INLINE Packet4ul  psignbit(const Packet4ul& a)  { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
+  return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
+  return pzero(a);
+}
 #endif
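+// psignbit returns all-ones in every lane whose sign bit is set (including
+// -0.0 and negative NaNs) by arithmetic-shifting that bit across the lane;
+// unsigned lanes can never be negative, hence the pzero overloads.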
 
-template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
-  return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 // Extract the exponent without relying on the existence of Packet4l.
-template<>
-EIGEN_STRONG_INLINE  
-Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
-  const Packet4d cst_exp_mask  = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
   __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
 #ifdef EIGEN_VECTORIZE_AVX2
   a_expo = _mm256_srli_epi64(a_expo, 52);
@@ -1506,16 +1882,18 @@
   return exponent;
 }
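+// For reference, pfrexp decomposes a = m * 2^e with 0.5 <= |m| < 1, so for a
+// normalized double e = biased_exponent - 1022 (IEEE-754 bias 1023, offset by
+// one because m is normalized to [0.5, 1) instead of [1, 2)).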
 
-
-template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
   return pldexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
   // Clamp exponent to [-2099, 2099]
   const Packet4d max_exponent = pset1<Packet4d>(2099.0);
   const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
@@ -1537,74 +1915,76 @@
   lo = _mm_slli_epi64(hi, 52);
   hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
   c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
-  out = pmul(out, c); // a * 2^e
+  out = pmul(out, c);  // a * 2^e
   return out;
 }
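+// Scaling model (sketch, assuming the elided lines bias the exponent as
+// usual): each factor c is a power of two built bit-wise by shifting a biased
+// exponent into bits [62:52] of the doubles, so the pmul chain computes
+// a * 2^e entirely inside the SIMD registers.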
 
-template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
-{
-  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
 }
-template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
-{
-  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
+template <>
+EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
+  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
 }
-template<> EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a)
-{
-  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1))));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
+  return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
   return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
 }
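+// Reduction shape shared by the four overloads (scalar model): fold the two
+// 128-bit halves, then let the SSE predux finish the tree, e.g. for floats
+//   predux(a) = (a0 + a4) + (a1 + a5) + (a2 + a6) + (a3 + a7).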
 
-template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
-{
-  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+template <>
+EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
+  return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
 }
-template<> EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a)
-{
-  return _mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1));
+template <>
+EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
+  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
 }
-template<> EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
   return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
   Packet8f tmp;
-  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
-  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
-  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
+  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
   Packet4d tmp;
-  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
-  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
+  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
-{
-  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
-  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
-  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
+  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
+  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
-{
-  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
+  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
   return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
-{
-  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
-  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
-  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
+  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
+  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
-{
-  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
+  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
@@ -1614,22 +1994,21 @@
 //   return _mm256_movemask_ps(x)==0xFF;
 // }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
   return _mm256_movemask_ps(x) != 0;
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
   return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
 }
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
   return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
   __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
@@ -1638,14 +2017,14 @@
   __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
   __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
   __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
-  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
-  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
-  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
-  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
-  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
-  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
-  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
-  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
   kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
   kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
   kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
@@ -1656,17 +2035,16 @@
   kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
 }
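+// Stage summary for the 8x8 transpose above: the unpack{lo,hi} steps
+// interleave row pairs within each 128-bit lane, shuffle_ps regroups them
+// into 4-element column fragments, and permute2f128 (0x20 = low halves,
+// 0x31 = high halves) stitches the fragments across lanes into full columns.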
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
   __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
   __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
   __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
 
-  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
-  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
-  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
-  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
 
   kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
   kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
@@ -1687,9 +2065,7 @@
 #define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
 #endif
 
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8i,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 8>& kernel) {
   __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
   __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
   __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
@@ -1698,14 +2074,14 @@
   __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
   __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
   __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
-  __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
-  __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
-  __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
-  __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
-  __m256i S4 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(1,0,1,0));
-  __m256i S5 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(3,2,3,2));
-  __m256i S6 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(1,0,1,0));
-  __m256i S7 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
   kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
   kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
   kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
@@ -1719,17 +2095,16 @@
   ptranspose((PacketBlock<Packet8i, 8>&)kernel);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 4>& kernel) {
   __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
   __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
   __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
   __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
 
-  __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
-  __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
-  __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
-  __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
 
   kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
   kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
@@ -1740,8 +2115,7 @@
   ptranspose((PacketBlock<Packet8i, 4>&)kernel);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4d,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
   __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
   __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
   __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
@@ -1753,24 +2127,32 @@
   kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
 }
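
All the ptranspose kernels in these hunks share one contract: kernel.packet[r]
holds row r of a square block, and the block is transposed in place. The
unpacklo/unpackhi, shuffle, and permute2f128 stages are a log-depth
implementation of the plain swap loop; as a reference-semantics sketch only
(illustrative name, not the vectorized path):

  // Reference semantics of ptranspose on an 8x8 float block:
  // after the call, element (r, c) holds the old element (c, r).
  inline void transpose8x8_ref(float m[8][8]) {
    for (int r = 0; r < 8; ++r) {
      for (int c = r + 1; c < 8; ++c) {
        float t = m[r][c];  // swap across the main diagonal
        m[r][c] = m[c][r];
        m[c][r] = t;
      }
    }
  }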
 
-template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
+                                    const Packet8f& elsePacket) {
 #ifdef EIGEN_VECTORIZE_AVX2
   const __m256i zero = _mm256_setzero_si256();
-  const __m256i select = _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  const __m256i select =
+      _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
+                       ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
   __m256i false_mask = _mm256_cmpeq_epi32(zero, select);
   return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask));
 #else
   const __m256 zero = _mm256_setzero_ps();
-  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
+                                      ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
   __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
   return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
+                                    const Packet4d& elsePacket) {
 #ifdef EIGEN_VECTORIZE_AVX2
   const __m256i zero = _mm256_setzero_si256();
-  const __m256i select = _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+  const __m256i select =
+      _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
   __m256i false_mask = _mm256_cmpeq_epi64(select, zero);
   return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask));
 #else
@@ -1783,35 +2165,52 @@
 
 // Packet math for Eigen::half
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
+template <>
+struct unpacket_traits<Packet8h> {
+  typedef Eigen::half type;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet8h half;
+};
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
   return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
   return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
   return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
   _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
   _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h
-ploaddup<Packet8h>(const Eigen::half*  from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
   const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
   const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
   const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
@@ -1819,14 +2218,15 @@
   return _mm_set_epi16(d, d, c, c, b, b, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h
-ploadquad<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
   const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
   const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
   return _mm_set_epi16(b, b, b, b, a, a, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
   return _mm_cmpeq_epi32(a, a);
 }
 
@@ -1840,8 +2240,8 @@
 #ifdef EIGEN_HAS_FP16_C
   return _mm256_cvtph_ps(a);
 #else
-  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(_mm256_insertf128_si256(
-      _mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
+  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(
+      _mm256_insertf128_si256(_mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
   return pp;
 #endif
 }
@@ -1852,19 +2252,17 @@
 #else
   __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
   __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
-  return   _mm_packus_epi32(lo, hi);
+  return _mm_packus_epi32(lo, hi);
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
-                                            const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
   return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
-                                            const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
   return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
 }
 
@@ -1873,87 +2271,108 @@
   return float2half(plset<Packet8f>(static_cast<float>(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
   // in some cases Packet4i is a wrapper around __m128i, so we need to cast
   // to Packet4i in order to call the intrinsics directly, as below:
-  return _mm_or_si128(a,b);
+  return _mm_or_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
-  return _mm_xor_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+  return _mm_xor_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
-  return _mm_and_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+  return _mm_and_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
-  return _mm_andnot_si128(b,a);
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+  return _mm_andnot_si128(b, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
   return _mm_blendv_epi8(b, a, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
   return float2half(pround<Packet8f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
   return float2half(print<Packet8f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
   return float2half(pceil<Packet8f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
   return float2half(pfloor<Packet8f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
   return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
   return Pack16To8(pcmp_le(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
   return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
   return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
   Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
   return _mm_xor_si128(a, sign_mask);
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
   Packet8f rf = padd(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
   Packet8f rf = psub(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
   Packet8f rf = pmul(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
   Packet8f af = half2float(a);
   Packet8f bf = half2float(b);
   Packet8f rf = pdiv(af, bf);
@@ -1961,68 +2380,70 @@
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
-{
-  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
-  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
-  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
-  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
-  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
-  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
-  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
-  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+template <>
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
   return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
 }
 
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
   EIGEN_ALIGN32 Eigen::half aux[8];
   pstore(aux, from);
-  to[stride*0] = aux[0];
-  to[stride*1] = aux[1];
-  to[stride*2] = aux[2];
-  to[stride*3] = aux[3];
-  to[stride*4] = aux[4];
-  to[stride*5] = aux[5];
-  to[stride*6] = aux[6];
-  to[stride*7] = aux[7];
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
 }
 
-
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
   Packet8f af = half2float(a);
   float reduced = predux<Packet8f>(af);
   return Eigen::half(reduced);
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
   Packet8f af = half2float(a);
   float reduced = predux_max<Packet8f>(af);
   return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
   Packet8f af = half2float(a);
   float reduced = predux_min<Packet8f>(af);
   return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
   Packet8f af = half2float(a);
   float reduced = predux_mul<Packet8f>(af);
   return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
-{
-  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
-  return _mm_shuffle_epi8(a,m);
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_shuffle_epi8(a, m);
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
   __m128i a = kernel.packet[0];
   __m128i b = kernel.packet[1];
   __m128i c = kernel.packet[2];
@@ -2069,8 +2490,7 @@
   kernel.packet[7] = a7b7c7d7e7f7g7h7;
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
   EIGEN_ALIGN32 Eigen::half in[4][8];
   pstore<Eigen::half>(in[0], kernel.packet[0]);
   pstore<Eigen::half>(in[1], kernel.packet[1]);
@@ -2081,10 +2501,10 @@
 
   for (int i = 0; i < 4; ++i) {
     for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][2*i];
+      out[i][j] = in[j][2 * i];
     }
     for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][2*i+1];
+      out[i][j + 4] = in[j][2 * i + 1];
     }
   }
 
@@ -2111,7 +2531,6 @@
 
 // Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
-
   __m256i input = _mm256_castps_si256(a);
 
 #ifdef EIGEN_VECTORIZE_AVX2
@@ -2130,8 +2549,7 @@
   __m256i nan = _mm256_set1_epi32(0x7fc0);
   t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
   // output = numext::bit_cast<uint16_t>(input);
-  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
-                         _mm256_extractf128_si256(t, 1));
+  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
 #else
   // uint32_t lsb = (input >> 16);
   __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
@@ -2158,32 +2576,38 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
   return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
   return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
   return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
   _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
   _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf
-ploaddup<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
   const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
   const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
   const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
@@ -2191,14 +2615,15 @@
   return _mm_set_epi16(d, d, c, c, b, b, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf
-ploadquad<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
   const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
   const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
   return _mm_set_epi16(b, b, b, b, a, a, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
   return _mm_cmpeq_epi32(a, a);
 }
 
@@ -2209,14 +2634,12 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
-                                                const Packet8bf& b) {
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
-                                                const Packet8bf& b) {
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
@@ -2225,131 +2648,153 @@
   return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
-  return _mm_or_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_or_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
-  return _mm_xor_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_xor_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
-  return _mm_and_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_and_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
-  return _mm_andnot_si128(b,a);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_andnot_si128(b, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
   return _mm_blendv_epi8(b, a, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
   return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
   return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
   return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
   return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
   return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
   return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
   return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
   return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
   Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
   return _mm_xor_si128(a, sign_mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
-{
-  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
-  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
-  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
-  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
-  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
-  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
-  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
-  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
   return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
 }
 
-template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
   EIGEN_ALIGN32 bfloat16 aux[8];
   pstore(aux, from);
-  to[stride*0] = aux[0];
-  to[stride*1] = aux[1];
-  to[stride*2] = aux[2];
-  to[stride*3] = aux[3];
-  to[stride*4] = aux[4];
-  to[stride*5] = aux[5];
-  to[stride*6] = aux[6];
-  to[stride*7] = aux[7];
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
   return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
   return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
   return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
   return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
-{
-  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
-  return _mm_shuffle_epi8(a,m);
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_shuffle_epi8(a, m);
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
   __m128i a = kernel.packet[0];
   __m128i b = kernel.packet[1];
   __m128i c = kernel.packet[2];
@@ -2387,8 +2832,7 @@
   kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
   __m128i a = kernel.packet[0];
   __m128i b = kernel.packet[1];
   __m128i c = kernel.packet[2];
@@ -2405,8 +2849,8 @@
   kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_AVX_H
+#endif  // EIGEN_PACKET_MATH_AVX_H
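
For reference, F32ToBf16 above vectorizes the usual round-to-nearest-even
float-to-bfloat16 reduction, collapsing NaNs to the quiet-NaN pattern 0x7fc0
(the same constant the AVX2 path blends in). A scalar sketch under those
assumptions, with illustrative names only:

  #include <cstdint>
  #include <cstring>

  // Round-to-nearest-even float -> bfloat16; NaN -> 0x7fc0. A sketch, not
  // the Eigen implementation.
  inline std::uint16_t float_to_bf16_rne(float f) {
    std::uint32_t input;
    std::memcpy(&input, &f, sizeof(input));     // bit-level view of the float
    if ((input & 0x7fffffffu) > 0x7f800000u) {  // NaN: max exponent, nonzero mantissa
      return 0x7fc0;
    }
    std::uint32_t lsb = (input >> 16) & 1u;     // lowest bit that survives truncation
    input += 0x7fffu + lsb;                     // bias so exact ties round to even
    return static_cast<std::uint16_t>(input >> 16);
  }

Adding 0x7fff rounds anything past the halfway point up, the extra lsb breaks
exact ties toward an even kept mantissa, and infinities pass through unchanged
because their mantissa bits are all zero.
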
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 49927b8..3688f8d 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -18,28 +18,39 @@
 namespace internal {
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
 
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
 
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
 
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
 
-template<> struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
-template<> struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
 
-template<> struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
-template<> struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
 #endif
 
 template <>
-EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a,
-                                                         const Packet8f& b) {
+EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a, const Packet8f& b) {
   __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ);
   __m256 nonzero_b = _mm256_cmp_ps(b, pzero(b), _CMP_NEQ_UQ);
   constexpr char kFF = '\255';
@@ -54,11 +65,11 @@
   __m128i b_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 0), shuffle_mask128_b_lo);
   __m128i merged = _mm_or_si128(_mm_or_si128(b_lo, b_hi), _mm_or_si128(a_lo, a_hi));
   return _mm_and_si128(merged, _mm_set1_epi8(1));
- #else
-  __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,  12,   8,   4,   0, kFF, kFF, kFF, kFF,
-                                           kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,  12,   8,   4,   0);
-  __m256i b_shuffle_mask = _mm256_set_epi8( 12,   8,   4,   0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
-                                           kFF, kFF, kFF, kFF,  12,   8,   4,   0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
+#else
+  __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF,
+                                           kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
+  __m256i b_shuffle_mask = _mm256_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
+                                           kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
   __m256i a_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_a), a_shuffle_mask);
   __m256i b_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_b), b_shuffle_mask);
   __m256i a_or_b = _mm256_or_si256(a_shuff, b_shuff);
@@ -70,124 +81,147 @@
 template <>
 EIGEN_STRONG_INLINE Packet8f pcast<Packet16b, Packet8f>(const Packet16b& a) {
   const __m256 cst_one = _mm256_set1_ps(1.0f);
-  #ifdef EIGEN_VECTORIZE_AVX2
+#ifdef EIGEN_VECTORIZE_AVX2
   __m256i a_extended = _mm256_cvtepi8_epi32(a);
   __m256i abcd_efgh = _mm256_cmpeq_epi32(a_extended, _mm256_setzero_si256());
-  #else
+#else
   __m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
   __m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
   __m128i aaaa_bbbb_cccc_dddd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
   __m128i eeee_ffff_gggg_hhhh = _mm_unpackhi_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
   __m256i abcd_efgh = _mm256_setr_m128i(aaaa_bbbb_cccc_dddd, eeee_ffff_gggg_hhhh);
-  #endif
+#endif
   __m256 result = _mm256_andnot_ps(_mm256_castsi256_ps(abcd_efgh), cst_one);
   return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
   return _mm256_cvttps_epi32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
   return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
   return _mm256_cvttpd_epi32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
   return _mm256_cvtepi32_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
   return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
   return _mm256_cvtpd_ps(a);
 }
 
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
   return _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
   return _mm256_cvtepi32_pd(a);
 }
 
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
   return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
   return _mm256_cvtps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8f>(const Packet8f& a) {
   return _mm256_castps_si256(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet8i>(const Packet8i& a) {
   return _mm256_castsi256_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
   return Packet8ui(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
   return Packet8i(a);
 }
 
 // truncation operations
 
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
   return _mm256_castps256_ps128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
   return _mm256_castpd256_pd128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
   return _mm256_castsi256_si128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
   return _mm256_castsi256_si128(a);
 }
 
-
 #ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
   return Packet4ul(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
   return Packet4l(a);
 }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
   return half2float(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
   return Bf16ToF32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
   return float2half(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
   return F32ToBf16(a);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_AVX_H
+#endif  // EIGEN_TYPE_CASTING_AVX_H
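
A distinction running through the TypeCasting hunks above: pcast is a value
conversion (for example, _mm256_cvttps_epi32 truncates toward zero), while
preinterpret merely relabels the same bits under a new packet type
(_mm256_castps_si256 compiles to no instruction at all). The scalar analogues,
as a sketch with illustrative names:

  #include <cstdint>
  #include <cstring>

  // Value conversion, like pcast<Packet8f, Packet8i>: truncates toward zero.
  inline std::int32_t cast_value(float f) {
    return static_cast<std::int32_t>(f);  // 1.9f -> 1, -1.9f -> -1
  }

  // Bit reinterpretation, like preinterpret<Packet8i, Packet8f>: the same
  // 32 bits viewed as an integer.
  inline std::int32_t cast_bits(float f) {
    std::int32_t i;
    std::memcpy(&i, &f, sizeof(i));  // 1.0f -> 0x3f800000
    return i;
  }
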
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index c484517..f2c8ce6 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -18,15 +18,14 @@
 namespace internal {
 
 //---------- float ----------
-struct Packet8cf
-{
+struct Packet8cf {
   EIGEN_STRONG_INLINE Packet8cf() {}
   EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
-  __m512  v;
+  __m512 v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet8cf type;
   typedef Packet4cf half;
   enum {
@@ -34,58 +33,80 @@
     AlignedOnScalar = 1,
     size = 8,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet8cf> {
+template <>
+struct unpacket_traits<Packet8cf> {
   typedef std::complex<float> type;
   typedef Packet4cf half;
   typedef Packet16f as_real;
   enum {
     size = 8,
-    alignment=unpacket_traits<Packet16f>::alignment,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    alignment = unpacket_traits<Packet16f>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(ptrue(Packet16f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(_mm512_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(_mm512_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) {
   return Packet8cf(pnegate(a.v));
 }
-template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) {
   const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
-    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
-    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet8cf(pxor(a.v,mask));
+      0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+      0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000));
+  return Packet8cf(pxor(a.v, mask));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
-  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+template <>
+EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
   return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet8cf pand<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf por<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pxor<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pandnot(a.v, b.v));
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
@@ -93,60 +114,71 @@
   return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet8cf pload<Packet8cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from)));
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
   const float re = std::real(from);
   const float im = std::imag(from);
   return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
-{
-  return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(_mm512_castpd_ps(ploaddup<Packet8d>((const double*)(const void*)from)));
 }
-template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
-{
-  return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(_mm512_castpd_ps(ploadquad<Packet8d>((const double*)(const void*)from)));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
-{
-  return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet8cf(_mm512_castpd_ps(pgather<double, Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from,
+                                                                       Index stride) {
   pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet8cf>(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a) {
   return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
-  return Packet8cf(_mm512_castsi512_ps(
-            _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
-                                      _mm512_castps_si512(a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+  return Packet8cf(_mm512_castsi512_ps(_mm512_permutexvar_epi64(
+      _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), _mm512_castps_si512(a.v))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
-{
-  return predux(padd(Packet4cf(extract256<0>(a.v)),
-                     Packet4cf(extract256<1>(a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a) {
+  return predux(padd(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
-{
-  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
-                         Packet4cf(extract256<1>(a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a) {
+  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
 }
 
 template <>
@@ -157,28 +189,27 @@
   return Packet4cf(res);
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f)
 
-template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
-{
-  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x) {
+  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
 }
 
 //---------- double ----------
-struct Packet4cd
-{
+struct Packet4cd {
   EIGEN_STRONG_INLINE Packet4cd() {}
   EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
-  __m512d  v;
+  __m512d v;
 };
 
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet4cd type;
   typedef Packet2cd half;
   enum {
@@ -186,58 +217,82 @@
     AlignedOnScalar = 0,
     size = 4,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet4cd> {
+template <>
+struct unpacket_traits<Packet4cd> {
   typedef std::complex<double> type;
   typedef Packet2cd half;
   typedef Packet8d as_real;
   enum {
     size = 4,
     alignment = unpacket_traits<Packet8d>::alignment,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
-{
-  const __m512d mask = _mm512_castsi512_pd(
-          _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
-                           0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
-  return Packet4cd(pxor(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(_mm512_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(_mm512_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) {
+  return Packet4cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) {
+  const __m512d mask = _mm512_castsi512_pd(_mm512_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0,
+                                                            0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  return Packet4cd(pxor(a.v, mask));
 }
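
The conjugation above XORs the sign bit of every imaginary lane: each repeated
0x80000000 word in the mask sits on the high word of an imaginary double. A
scalar sketch of the same bit trick, assuming the usual IEEE-754 binary64
layout (pconj_ref is a hypothetical reference helper):

#include <complex>
#include <cstdint>
#include <cstring>

static std::complex<double> pconj_ref(std::complex<double> a) {
  double im = a.imag();
  std::uint64_t bits;
  std::memcpy(&bits, &im, sizeof bits);
  bits ^= 0x8000000000000000ULL;  // flip the sign bit, i.e. negate the imaginary part
  std::memcpy(&im, &bits, sizeof im);
  return std::complex<double>(a.real(), im);
}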
 
-template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
-  __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
-  __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
-  __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
-  __m512d odd  = _mm512_mul_pd(tmp2, tmp3);
+template <>
+EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  __m512d tmp1 = _mm512_shuffle_pd(a.v, a.v, 0x0);
+  __m512d tmp2 = _mm512_shuffle_pd(a.v, a.v, 0xFF);
+  __m512d tmp3 = _mm512_shuffle_pd(b.v, b.v, 0x55);
+  __m512d odd = _mm512_mul_pd(tmp2, tmp3);
   return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
 }
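
The shuffles above implement the textbook complex product: tmp1 duplicates the
real parts of a (imm 0x0), tmp2 its imaginary parts (imm 0xFF), tmp3 swaps
re/im within each pair of b (imm 0x55), and fmaddsub subtracts in even (real)
lanes while adding in odd (imaginary) lanes. A per-lane scalar sketch
(pmul_ref is a hypothetical reference helper):

#include <complex>

static std::complex<double> pmul_ref(std::complex<double> a, std::complex<double> b) {
  const double ar = a.real(), ai = a.imag();
  // even lane: ar*br - ai*bi  (fmaddsub subtracts in even positions)
  // odd  lane: ar*bi + ai*br  (fmaddsub adds in odd positions)
  return std::complex<double>(ar * b.real() - ai * b.imag(),
                              ar * b.imag() + ai * b.real());
}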
 
-template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(ptrue(Packet8d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pand<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd por<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pxor<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pandnot(a.v, b.v));
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
@@ -245,81 +300,95 @@
   return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
-{
-  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+template <>
+EIGEN_STRONG_INLINE Packet4cd pload<Packet4cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+template <>
+EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
+  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4(_mm_castpd_ps(pset1<Packet1cd>(from).v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+  return Packet4cd(
+      _mm512_insertf64x4(_mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from + 1).v, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from,
+                                                                            Index stride) {
   return Packet4cd(_mm512_insertf64x4(
-          _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+      _mm512_castpd256_pd512(_mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 0 * stride).v),
+                                                  ploadu<Packet1cd>(from + 1 * stride).v, 1)),
+      _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 2 * stride).v),
+                           ploadu<Packet1cd>(from + 3 * stride).v, 1),
+      1));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
-{
-  return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from,
+                                                                        Index stride) {
   __m512i fromi = _mm512_castpd_si512(from.v);
   double* tod = (double*)(void*)to;
-  _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
-  _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
-  _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
-  _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+  _mm_storeu_pd(tod + 0 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 0)));
+  _mm_storeu_pd(tod + 2 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 1)));
+  _mm_storeu_pd(tod + 4 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 2)));
+  _mm_storeu_pd(tod + 6 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 3)));
 }
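
Both routines above address complex<double> elements that sit `stride` complex
values apart; the scatter steps by 2*stride because it indexes raw doubles and
each element spans two of them. Reference semantics as a scalar sketch (the
*_ref helpers are hypothetical, not Eigen API):

#include <complex>

static void pgather4cd_ref(const std::complex<double>* from, long stride,
                           std::complex<double> (&out)[4]) {
  for (int k = 0; k < 4; ++k) out[k] = from[k * stride];
}

static void pscatter4cd_ref(std::complex<double>* to,
                            const std::complex<double> (&in)[4], long stride) {
  for (int k = 0; k < 4; ++k) to[k * stride] = in[k];  // == tod + 2*k*stride in doubles
}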
 
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a) {
   __m128d low = extract128<0>(a.v);
   EIGEN_ALIGN16 double res[2];
   _mm_store_pd(res, low);
-  return std::complex<double>(res[0],res[1]);
+  return std::complex<double>(res[0], res[1]);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
-  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
+template <>
+EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3, 2, 1, 0>::mask)));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
-{
-  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
-                     Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a) {
+  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
-{
-  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
-                         Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a) {
+  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d)
 
-template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
-{
-  return Packet4cd(_mm512_permute_pd(x.v,0x55));
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x) {
+  return Packet4cd(_mm512_permute_pd(x.v, 0x55));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,4>& kernel) {
-  PacketBlock<Packet8d,4> pb;
-  
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
+  PacketBlock<Packet8d, 4> pb;
+
   pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
   pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
   pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
@@ -331,10 +400,9 @@
   kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,8>& kernel) {
-  PacketBlock<Packet8d,8> pb;
-  
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
+  PacketBlock<Packet8d, 8> pb;
+
   pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
   pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
   pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
@@ -354,28 +422,33 @@
   kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cd,4>& kernel) {
-  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
-  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
-  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
-  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
+  __m512d T0 =
+      _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0, 1, 0, 1>::mask));  // [a0 a1 b0 b1]
+  __m512d T1 =
+      _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2, 3, 2, 3>::mask));  // [a2 a3 b2 b3]
+  __m512d T2 =
+      _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0, 1, 0, 1>::mask));  // [c0 c1 d0 d1]
+  __m512d T3 =
+      _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2, 3, 2, 3>::mask));  // [c2 c3 d2 d3]
 
-  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
-  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
-  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
-  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1, 3, 1, 3>::mask)));  // [a3 b3 c3 d3]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0, 2, 0, 2>::mask)));  // [a2 b2 c2 d2]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1, 3, 1, 3>::mask)));  // [a1 b1 c1 d1]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0, 2, 0, 2>::mask)));  // [a0 b0 c0 d0]
 }
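
The shuffle rounds above realize a 4x4 transpose of 128-bit complex lanes: the
first round interleaves 2x2 blocks of rows, the second gathers matching column
positions, so packet r ends up holding {a_r, b_r, c_r, d_r}. The net index
permutation, as a scalar sketch over opaque lanes (ptranspose4cd_ref is a
hypothetical reference helper):

#include <complex>
#include <utility>

static void ptranspose4cd_ref(std::complex<double> m[4][4]) {
  // Plain matrix transpose of the four input packets.
  for (int r = 0; r < 4; ++r)
    for (int c = r + 1; c < 4; ++c) std::swap(m[r][c], m[c][r]);
}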
 
-template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
   return psqrt_complex<Packet4cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
   return psqrt_complex<Packet8cf>(a);
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_AVX512_H
+#endif  // EIGEN_COMPLEX_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 2df1704..e06b83c 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -639,7 +639,8 @@
     }
   }
 
-  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
+  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
+            bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                            Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
     const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
@@ -696,7 +697,8 @@
   *  bo += b_unroll * k_factor;
    */
 
-  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch, bool no_a_preload = false>
+  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
+            bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
     int fetchA_idx = 0;
     int fetchB_idx = 0;
@@ -705,20 +707,21 @@
     const bool ktail = k_factor == 1;
 
     static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
-    static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1), "skipping a preload only allowed when k unroll is 1");
+    static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1),
+                  "skipping a preload only allowed when k unroll is 1");
 
     if (k_factor > 0)
-      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
-                                                                                    fetchB_idx);
+      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
     if (k_factor > 1)
-      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
-                                                                                    fetchB_idx);
+      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
     if (k_factor > 2)
-      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
-                                                                                    fetchB_idx);
+      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
     if (k_factor > 3)
-      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
-                                                                                    fetchB_idx);
+      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
 
     // Advance A/B pointers after uk-loop.
     ao += a_unroll * k_factor;
@@ -1201,10 +1204,9 @@
 
 template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
-  EIGEN_ALWAYS_INLINE
-  void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth,
-                  Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
-                  Index offsetB = 0);
+  EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
+                                      Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
+                                      Index offsetA = 0, Index offsetB = 0);
 };
 
 template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
@@ -1233,7 +1235,7 @@
     }
   }
 }
-#endif // EIGEN_USE_AVX512_GEMM_KERNELS
+#endif  // EIGEN_USE_AVX512_GEMM_KERNELS
 
 }  // namespace internal
 }  // namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 08e5fe8..0677248 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,14 +47,12 @@
 
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
-psqrt<Packet16f>(const Packet16f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) {
   return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
 }
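
generic_sqrt_newton_step refines the hardware reciprocal-sqrt estimate before
multiplying back by x. A scalar sketch of the usual single Newton-Raphson step
(the helper's exact step count and formulation may differ; sqrt_newton_ref is
a hypothetical illustration):

static float sqrt_newton_ref(float x, float y /* hardware rsqrt estimate */) {
  y = y * (1.5f - 0.5f * x * y * y);  // one Newton-Raphson refinement of 1/sqrt(x)
  return x * y;                       // sqrt(x) == x * (1/sqrt(x))
}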
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
-psqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
   return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
 #else
@@ -82,26 +80,24 @@
 #elif EIGEN_FAST_MATH
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
-prsqrt<Packet16f>(const Packet16f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) {
   return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
 }
 #endif
 
-
 // prsqrt for double.
 #if EIGEN_FAST_MATH
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
-prsqrt<Packet8d>(const Packet8d& _x) {
-  #ifdef EIGEN_VECTORIZE_AVX512ER
-    return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
-  #else
-    return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
-  #endif
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+#else
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+#endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512ER
   return _mm512_rcp28_ps(a);
 #else
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index c6566a4..b6d2d98 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -53,7 +53,10 @@
 };
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet16h> {
+  enum { value = true };
+};
 
 template <>
 struct packet_traits<half> : default_packet_traits {
@@ -65,41 +68,41 @@
     AlignedOnScalar = 1,
     size = 16,
 
-    HasCmp    = 1,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 1,
-    HasAbs2   = 0,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 0,
-    HasSqrt   = 1,
-    HasRsqrt  = 1,
-    HasLog    = 1,
-    HasLog1p  = 1,
-    HasExp    = 1,
-    HasExpm1  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
     HasBessel = 1,
-    HasNdtri  = 1,
-    HasSin    = EIGEN_FAST_MATH,
-    HasCos    = EIGEN_FAST_MATH,
-    HasTanh   = EIGEN_FAST_MATH,
-    HasErf    = EIGEN_FAST_MATH,
-    HasBlend  = 0,
-    HasRound  = 1,
-    HasFloor  = 1,
-    HasCeil   = 1,
-    HasRint   = 1
+    HasNdtri = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1
   };
 };
 #endif
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet16f type;
   typedef Packet8f half;
   enum {
@@ -108,9 +111,9 @@
     size = 16,
 
     HasAbs = 1,
-    HasMin   = 1,
-    HasMax   = 1,
-    HasConj  = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasBlend = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
@@ -121,24 +124,24 @@
     HasSqrt = 1,
     HasRsqrt = 1,
     HasLog = 1,
-    HasLog1p  = 1,
-    HasExpm1  = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasNdtri = 1,
-    HasBessel  = 1,
+    HasBessel = 1,
     HasExp = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
     HasRint = 1
   };
- };
-template<> struct packet_traits<double> : default_packet_traits
-{
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet8d type;
   typedef Packet4d half;
   enum {
@@ -148,10 +151,10 @@
     HasBlend = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasLog  = 1,
+    HasLog = 1,
     HasExp = 1,
     HasATan = 1,
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasRound = 1,
     HasFloor = 1,
@@ -160,18 +163,11 @@
   };
 };
 
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet16i type;
   typedef Packet8i half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    HasBlend = 0,
-    HasCmp = 1,
-    HasDiv = 1,
-    size=16
-  };
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
 };
 
 template <>
@@ -180,28 +176,54 @@
   typedef Packet8f half;
   typedef Packet16i integer_packet;
   typedef uint16_t mask_t;
-  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true };
+  enum {
+    size = 16,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true,
+    masked_fpops_available = true
+  };
 };
 template <>
 struct unpacket_traits<Packet8d> {
   typedef double type;
   typedef Packet4d half;
   typedef uint8_t mask_t;
-  enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true };
+  enum {
+    size = 8,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true,
+    masked_fpops_available = true
+  };
 };
 template <>
 struct unpacket_traits<Packet16i> {
   typedef int type;
   typedef Packet8i half;
-  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 16,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<>
+template <>
 struct unpacket_traits<Packet16h> {
   typedef Eigen::half type;
   typedef Packet8h half;
-  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 #endif
 
@@ -228,21 +250,30 @@
   return _mm512_castsi512_pd(_mm512_set1_epi64(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) {
+  return _mm512_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) {
+  return _mm512_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
+  return _mm512_setzero_si512();
+}
 
-template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
-  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
-                                              0, -1, 0, -1, 0, -1, 0, -1));
+template <>
+EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
 }
-template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
-  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
-                          0, -1, 0, -1, 0, -1, 0, -1);
+template <>
+EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
 }
-template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
-  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
-                                              0, 0, -1, -1, 0, 0, -1, -1));
+template <>
+EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
 }
 
 template <>
@@ -251,7 +282,7 @@
   // Inline asm here helps reduce some register spilling in TRSM kernels.
   // See note in unrolls::gemm::microKernel in TrsmKernel.h
   Packet16f ret;
-  __asm__  ("vbroadcastss %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from));
+  __asm__("vbroadcastss %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
   return ret;
 #else
   return _mm512_broadcastss_ps(_mm_load_ps1(from));
@@ -261,7 +292,7 @@
 EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
 #if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
   Packet8d ret;
-  __asm__  ("vbroadcastsd %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from));
+  __asm__("vbroadcastsd %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
   return ret;
 #else
   return _mm512_set1_pd(*from);
@@ -270,67 +301,52 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
-  return _mm512_add_ps(
-      _mm512_set1_ps(a),
-      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
-                    4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+  return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
+                                                        6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
-  return _mm512_add_pd(_mm512_set1_pd(a),
-                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+  return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
-  return _mm512_add_epi32(
-      _mm512_set1_epi32(a),
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+  return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
   return _mm512_add_ps(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b) {
   return _mm512_add_pd(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_add_epi32(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b,
-                                              uint16_t umask) {
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
   __mmask16 mask = static_cast<__mmask16>(umask);
   return _mm512_maskz_add_ps(mask, a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b,
-                                            uint8_t umask) {
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b, uint8_t umask) {
   __mmask8 mask = static_cast<__mmask8>(umask);
   return _mm512_maskz_add_pd(mask, a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
   return _mm512_sub_ps(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, const Packet8d& b) {
   return _mm512_sub_pd(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_sub_epi32(a, b);
 }
 
@@ -339,16 +355,16 @@
   // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
   //       The Intel docs give it a relatively high latency as well, so we're probably
   //       better off using _mm512_set_epi32 directly anyway.
-  const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000,
-                                        0x80000000,0x80000000,0x80000000,0x80000000,
-                                        0x80000000,0x80000000,0x80000000,0x80000000,
-                                        0x80000000,0x80000000,0x80000000,0x80000000);
+  const __m512i mask =
+      _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
+                       0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
   return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
-  const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
-                                        0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
+  const __m512i mask =
+      _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
+                       0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
   return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
 }
 template <>
@@ -370,202 +386,186 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
   return _mm512_mul_ps(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, const Packet8d& b) {
   return _mm512_mul_pd(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_mullo_epi32(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
   return _mm512_div_ps(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, const Packet8d& b) {
   return _mm512_div_pd(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b,0));
+EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
   Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
   return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
 }
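
AVX-512 has no packed 32-bit integer divide, so pdiv<Packet16i> above falls
back to two Packet8i divisions and re-inserts the halves. The lane-level
behavior, as a scalar sketch (pdiv16i_ref is a hypothetical reference helper,
assuming truncated integer division per lane):

static void pdiv16i_ref(const int (&a)[16], const int (&b)[16], int (&q)[16]) {
  for (int i = 0; i < 16; ++i) q[i] = a[i] / b[i];  // lanes 0..7 = low half, 8..15 = high half
}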
 
 #ifdef EIGEN_VECTORIZE_FMA
 template <>
-EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
-                                    const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
   return _mm512_fmadd_ps(a, b, c);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
-                                   const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
   return _mm512_fmadd_pd(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b,
-                                    const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
   return _mm512_fmsub_ps(a, b, c);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b,
-                                   const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
   return _mm512_fmsub_pd(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b,
-                                    const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
   return _mm512_fnmadd_ps(a, b, c);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b,
-                                   const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
   return _mm512_fnmadd_pd(a, b, c);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b,
-                                    const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
   return _mm512_fnmsub_ps(a, b, c);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b,
-                                   const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
   return _mm512_fnmsub_pd(a, b, c);
 }
 #endif
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
-                                           const Packet16f& a,
-                                           const Packet16f& b) {
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
   __mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
   return _mm512_mask_blend_ps(mask16, a, b);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask,
-                                           const Packet16i& a,
-                                           const Packet16i& b) {
+EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16i& a, const Packet16i& b) {
   __mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
   return _mm512_mask_blend_epi32(mask16, a, b);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
-                                          const Packet8d& a,
-                                          const Packet8d& b) {
-  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
-                                         _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
   return _mm512_mask_blend_pd(mask8, a, b);
 }
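
All three pselect overloads above compare the mask against zero to obtain a
k-register and then blend: lanes whose mask is non-zero take a, zero-mask
lanes take b. A per-lane sketch (pselect_lane_ref is a hypothetical helper):

#include <cstdint>

static float pselect_lane_ref(std::uint32_t mask_lane, float a, float b) {
  return mask_lane != 0 ? a : b;  // the blend picks b exactly where the mask lane is zero
}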
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
   // Arguments are reversed to match NaN propagation behavior of std::min.
   return _mm512_min_ps(b, a);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, const Packet8d& b) {
   // Arguments are reversed to match NaN propagation behavior of std::min.
   return _mm512_min_pd(b, a);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_min_epi32(b, a);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
   // Arguments are reversed to match NaN propagation behavior of std::max.
   return _mm512_max_ps(b, a);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, const Packet8d& b) {
   // Arguments are reversed to match NaN propagation behavior of std::max.
   return _mm512_max_pd(b, a);
 }
 template <>
-EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
   return _mm512_max_epi32(b, a);
 }
 
 // Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet16f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet8d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet16f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet8d>);
 }
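
The reversed arguments in pmin/pmax above exist because the hardware min/max
return their second operand on an unordered compare; swapping makes
pmin(a, NaN) == a, matching std::min's "return the first argument when the
comparison fails". The PropagateNaN variants then layer the opposite policy on
top via pminmax_propagate_nan. A scalar sketch of the two policies (the *_ref
helpers are hypothetical illustrations):

#include <cmath>
#include <limits>

static float pmin_ref(float a, float b) {
  return b < a ? b : a;  // unordered compare is false, so the first argument wins on NaN
}
static float pmin_propagate_nan_ref(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<float>::quiet_NaN();
  return pmin_ref(a, b);
}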
 
-
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
-template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
-EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
-EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) { return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1); }
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return _mm512_extractf32x8_ps(x, I_);
+}
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm512_extractf64x2_pd(x, I_);
+}
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
+}
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
+  return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
+}
 #else
 // AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
-template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
-  return  _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
 }
 
 // AVX512F does not define _mm512_extractf64x2_pd to extract __m128 from __m512
-template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
-  return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
 }
 
 EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
-  return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
-                                                _mm256_castps_si256(b),1));
+  return _mm512_castsi512_ps(
+      _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
 }
 EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
   return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
@@ -584,10 +584,8 @@
   //   dst[255:240] := Saturate16(rf[255:224])
   __m256i lo = _mm256_castps_si256(extract256<0>(rf));
   __m256i hi = _mm256_castps_si256(extract256<1>(rf));
-  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
-                                      _mm256_extractf128_si256(lo, 1));
-  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
-                                      _mm256_extractf128_si256(hi, 1));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
   return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
 }
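
The hunk above packs 32-bit lanes down to 16 bits with signed saturation via
_mm_packs_epi32; the Saturate16 pseudo-ops in the quoted comment clamp each
value to the int16_t range. A scalar sketch (saturate16_ref is a hypothetical
reference helper):

#include <cstdint>

static std::int16_t saturate16_ref(std::int32_t v) {
  if (v > 32767) return 32767;    // INT16_MAX
  if (v < -32768) return -32768;  // INT16_MIN
  return static_cast<std::int16_t>(v);
}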
 
@@ -600,36 +598,38 @@
 template <>
 EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
 }
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
   __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
   return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
 }
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
   return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
 }
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
   __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
   return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
 }
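
Every pcmp_* above follows the same pattern: the AVX-512 compare yields one
k-mask bit per lane, and _mm512_mask_set1_epi32 widens that bit back into an
all-ones/all-zeros lane so the result can feed bitwise selects. A per-lane
sketch (cmp_lt_lane_ref is a hypothetical helper):

#include <cstdint>

static std::uint32_t cmp_lt_lane_ref(int a, int b) {
  return a < b ? 0xffffffffu : 0u;  // widened k-mask bit for this lane
}
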
@@ -637,36 +637,50 @@
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
   __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
 
-template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
-template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
+}
 
-template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
-template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
@@ -684,23 +698,20 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
-                                              const Packet16i& b) {
-  return _mm512_and_si512(a,b);
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_and_si512(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
-                                              const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_and_ps(a, b);
 #else
-  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
-                                            const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_and_pd(a, b);
 #else
@@ -725,17 +736,16 @@
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_ps(a, b);
 #else
-  return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
 #endif
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
-                                           const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_or_pd(a, b);
 #else
-  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
 #endif
 }
 
@@ -749,7 +759,7 @@
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_ps(a, b);
 #else
-  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
 #endif
 }
 
@@ -758,7 +768,7 @@
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_xor_pd(a, b);
 #else
-  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
 #endif
 }
 
@@ -772,42 +782,45 @@
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_andnot_ps(b, a);
 #else
-  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_andnot_pd(b, a);
 #else
-  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a) {
   // Work-around for default std::round rounding mode.
   const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
   const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
   return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
-template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a) {
   // Work-around for default std::round rounding mode.
   const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
   const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
   return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
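
For context on the two pround work-arounds above: _MM_FROUND_TO_NEAREST_INT rounds halfway cases to even, while std::round rounds them away from zero, so the code adds the largest float below 0.5, carrying the sign of the input, and then truncates. A minimal scalar sketch of the same trick (illustrative only, not part of the patch):

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar model of the vectorized work-around: give the constant just below
// 0.5 (bit pattern 0x3EFFFFFF) the sign of 'a', add, then round toward zero.
float round_half_away_from_zero(float a) {
  std::uint32_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  std::uint32_t offset_bits = 0x3EFFFFFFu | (bits & 0x80000000u);
  float offset;
  std::memcpy(&offset, &offset_bits, sizeof(offset));
  return std::trunc(a + offset);  // plays the role of _MM_FROUND_TO_ZERO
}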
 
-template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
   return _mm512_srai_epi32(a, N);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
   return _mm512_srli_epi32(a, N);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
   return _mm512_slli_epi32(a, N);
 }
 
@@ -821,8 +834,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
-    reinterpret_cast<const __m512i*>(from));
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(reinterpret_cast<const __m512i*>(from));
 }
 
 template <>
@@ -835,8 +847,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
-      reinterpret_cast<const __m512i*>(from));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(from));
 }
 
 template <>
@@ -868,7 +879,7 @@
 // a3}
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
- __m512d x = _mm512_setzero_pd();
+  __m512d x = _mm512_setzero_pd();
   x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
   x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
   x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
@@ -879,10 +890,10 @@
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
   __m512d x = _mm512_setzero_pd();
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
-  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 0, _mm_load_sd(from + 0));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 2, _mm_load_sd(from + 1));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 4, _mm_load_sd(from + 2));
+  x = _mm512_mask_broadcastsd_pd(x, 0x3 << 6, _mm_load_sd(from + 3));
   return x;
 }
 #endif
@@ -902,7 +913,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
   Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
-  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
   return _mm512_permutexvar_ps(scatter_mask, tmp);
 }
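
The permute above replicates each of the four loaded floats into a group of four lanes; since _mm512_set_epi32 lists lanes from high to low, lane i of the result picks element i/4. A scalar reference (illustrative only):

// Reference semantics of ploadquad<Packet16f>:
// {a0,a0,a0,a0, a1,a1,a1,a1, a2,a2,a2,a2, a3,a3,a3,a3}
void ploadquad_ref(const float* from, float out[16]) {
  for (int i = 0; i < 16; ++i) out[i] = from[i / 4];
}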
 
@@ -911,7 +922,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
   __m256d lane0 = _mm256_set1_pd(*from);
-  __m256d lane1 = _mm256_set1_pd(*(from+1));
+  __m256d lane1 = _mm256_set1_pd(*(from + 1));
   __m512d tmp = _mm512_undefined_pd();
   tmp = _mm512_insertf64x4(tmp, lane0, 0);
   return _mm512_insertf64x4(tmp, lane1, 1);
@@ -922,7 +933,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
   Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
-  const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+  const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
   return _mm512_permutexvar_epi32(scatter_mask, tmp);
 }
 
@@ -936,8 +947,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
-                                                from);
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
 }
 
 template <>
@@ -950,8 +960,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
-      reinterpret_cast<__m512i*>(to), from);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
 }
 template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
@@ -965,25 +974,20 @@
 }
 
 template <typename Scalar, typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from,
-    Index stride, typename unpacket_traits<Packet>::mask_t umask);
+EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, Index stride,
+                                        typename unpacket_traits<Packet>::mask_t umask);
 template <>
-EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src,
-                                                             const float* from,
-                                                             Index stride,
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src, const float* from, Index stride,
                                                              uint16_t umask) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
   __mmask16 mask = static_cast<__mmask16>(umask);
 
   return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
 }
 template <>
-EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src,
-                                                            const double* from,
-                                                            Index stride,
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src, const double* from, Index stride,
                                                             uint8_t umask) {
   Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
@@ -994,18 +998,15 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
-                                                             Index stride) {
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
 
   return _mm512_i32gather_ps(indices, from, 4);
 }
 template <>
-EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
-                                                            Index stride) {
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
   Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@@ -1013,34 +1014,27 @@
   return _mm512_i32gather_pd(indices, from, 8);
 }
 template <>
-EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from,
-                                                           Index stride) {
+EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
   return _mm512_i32gather_epi32(indices, from, 4);
 }
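
All of the gathers above share one index recipe: indices = stride * {0, 1, ..., N-1}, with the trailing 4/8 argument scaling each index by sizeof(Scalar). A scalar model of the unmasked case (names are illustrative):

// Lane i of the gathered packet loads from[i * stride].
template <typename Scalar, int N>
void pgather_ref(const Scalar* from, long stride, Scalar out[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i * stride];
}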
 
 template <typename Scalar, typename Packet>
-EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from,
-    Index stride, typename unpacket_traits<Packet>::mask_t umask);
+EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
+                                       typename unpacket_traits<Packet>::mask_t umask);
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
-                                                         const Packet16f& from,
-                                                         Index stride,
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride,
                                                          uint16_t umask) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
   __mmask16 mask = static_cast<__mmask16>(umask);
   _mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
-                                                         const Packet8d& from,
-                                                         Index stride,
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride,
                                                          uint8_t umask) {
   Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
@@ -1050,31 +1044,23 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
-                                                         const Packet16f& from,
-                                                         Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_ps(to, indices, from, 4);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
-                                                         const Packet8d& from,
-                                                         Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
   Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_pd(to, indices, from, 8);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to,
-                                                       const Packet16i& from,
-                                                       Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
   Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
-  Packet16i stride_multiplier =
-      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_epi32(to, indices, from, 4);
 }
@@ -1095,9 +1081,18 @@
   pstore(to, pa);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -1112,69 +1107,81 @@
   return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
   return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
   return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
   return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
   // _mm512_abs_ps intrinsic not found, so hack around it
   return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
   // _mm512_abs_pd intrinsic not found, so hack around it
-  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
-                                   _mm512_set1_epi64(0x7fffffffffffffff)));
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
 }
-template<> EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
   return _mm512_abs_epi32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h  psignbit(const Packet16h&  a) { return _mm256_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) { return _mm256_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet16f  psignbit(const Packet16f&  a) { return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31)); }
-template<> EIGEN_STRONG_INLINE Packet8d   psignbit(const Packet8d&   a) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63)); }
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
+  return _mm256_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
+  return _mm256_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) {
+  return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) {
+  return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
+}
 
-template<>
-EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
+template <>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
 // Extract the exponent without the existence of Packet8l.
-template<>
-EIGEN_STRONG_INLINE
-Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
-  const Packet8d cst_exp_mask  = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
-  #ifdef EIGEN_VECTORIZE_AVX512DQ
+template <>
+EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+#ifdef EIGEN_VECTORIZE_AVX512DQ
   return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
-  #else
+#else
   return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
-  #endif
+#endif
 }
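
Both branches above compute the same thing: isolate the 11 exponent bits of each double, shift them to the bottom, and convert the result to double (AVX512DQ converts 64-bit integers directly; the fallback narrows to 32 bits first). A scalar sketch, assuming IEEE-754 doubles:

#include <cstdint>
#include <cstring>

// Scalar model of pfrexp_generic_get_biased_exponent for one double.
double biased_exponent(double a) {
  std::uint64_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  return static_cast<double>((bits & 0x7ff0000000000000ull) >> 52);
}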
 
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
   return pldexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
   // Clamp exponent to [-2099, 2099]
   const Packet8d max_exponent = pset1<Packet8d>(2099.0);
   const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
@@ -1203,30 +1210,26 @@
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
-#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                           \
-  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);                    \
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)        \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
   __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
 
 // AVX512F does not define _mm512_extracti32x8_epi32 to extract __m256i from __m512i
-#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)                           \
-  __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0);                \
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)            \
+  __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
   __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
 #else
-#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
-  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
-      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
-      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
-  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
-      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
-      _mm512_extractf32x4_ps(INPUT, 3), 1)
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                                                     \
+  __m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+                                           _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
+  __m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+                                           _mm512_extractf32x4_ps(INPUT, 3), 1)
 
-#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)                    \
-  __m256i OUTPUT##_0 = _mm256_insertf128_si256(                     \
-      _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)),  \
-      _mm512_extracti32x4_epi32(INPUT, 1), 1);                      \
-  __m256i OUTPUT##_1 = _mm256_insertf128_si256(                     \
-      _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)),  \
-      _mm512_extracti32x4_epi32(INPUT, 3), 1)
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)                                                            \
+  __m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
+                                               _mm512_extracti32x4_epi32(INPUT, 1), 1);                     \
+  __m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
+                                               _mm512_extracti32x4_epi32(INPUT, 3), 1)
 #endif
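
Either way, the macros above leave the two 256-bit halves of INPUT in OUTPUT##_0 and OUTPUT##_1; the AVX512F fallback simply stitches each half from two 128-bit extracts. A hypothetical use (the names are illustrative):

__m512 x = _mm512_set1_ps(1.0f);
EIGEN_EXTRACT_8f_FROM_16f(x, lanes);  // declares __m256 lanes_0 and lanes_1
__m256 sum = _mm256_add_ps(lanes_0, lanes_1);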
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -1243,7 +1246,7 @@
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
 
-#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB)                    \
+#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB)                       \
   OUTPUT = _mm512_undefined_epi32();                                           \
   OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
   OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
@@ -1337,7 +1340,7 @@
 
 template <>
 EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-//#ifdef EIGEN_VECTORIZE_AVX512DQ
+// #ifdef EIGEN_VECTORIZE_AVX512DQ
 #if 0
   Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
   Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
@@ -1403,17 +1406,17 @@
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) {
   Packet16i xi = _mm512_castps_si512(x);
-  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
-  return !_mm512_kortestz(tmp,tmp);
+  __mmask16 tmp = _mm512_test_epi32_mask(xi, xi);
+  return !_mm512_kortestz(tmp, tmp);
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x)
-{
-  __mmask16 tmp = _mm512_test_epi32_mask(x,x);
-  return !_mm512_kortestz(tmp,tmp);
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x) {
+  __mmask16 tmp = _mm512_test_epi32_mask(x, x);
+  return !_mm512_kortestz(tmp, tmp);
 }
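
In both predux_any overloads, _mm512_test_epi32_mask sets mask bit i when lane i has any bit set, and _mm512_kortestz returns 1 only for an all-zero mask, hence the negation. A scalar reference (note that -0.0f counts as non-zero bits under this definition):

#include <cstdint>
#include <cstring>

bool predux_any_ref(const float x[16]) {
  for (int i = 0; i < 16; ++i) {
    std::uint32_t bits;
    std::memcpy(&bits, &x[i], sizeof(bits));
    if (bits != 0) return true;  // lane has at least one bit set
  }
  return false;
}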
 
 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1530,28 +1533,27 @@
   PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
   PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
 }
-#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
-  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
-                           INPUT[2 * INDEX + STRIDE]);
+#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
-  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0],kernel.packet[1]);
-  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0],kernel.packet[1]);
-  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2],kernel.packet[3]);
-  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2],kernel.packet[3]);
-  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4],kernel.packet[5]);
-  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4],kernel.packet[5]);
-  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6],kernel.packet[7]);
-  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6],kernel.packet[7]);
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
 
-  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2)));
-  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2)));
-  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3)));
-  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3)));
-  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6)));
-  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6)));
-  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
-  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
+  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
 
   T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
   T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
@@ -1612,8 +1614,7 @@
 
 #define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
   OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
-  OUTPUT[INDEX] =                                                           \
-      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
   __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
@@ -1623,23 +1624,15 @@
 
   PacketBlock<Packet4d, 8> tmp;
 
-  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
-                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
-  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
-                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
-  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
-                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
-  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
-                                         _mm512_extractf64x4_pd(T3, 0), 0x31);
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);
 
-  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
-                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
-  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
-                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
-  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
-                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
-  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
-                                         _mm512_extractf64x4_pd(T3, 1), 0x31);
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);
 
   PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
   PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
@@ -1648,64 +1641,66 @@
 }
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
-    __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0],kernel.packet[1]);
-    __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0],kernel.packet[1]);
-    __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2],kernel.packet[3]);
-    __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2],kernel.packet[3]);
-    __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4],kernel.packet[5]);
-    __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4],kernel.packet[5]);
-    __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6],kernel.packet[7]);
-    __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6],kernel.packet[7]);
+  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
+  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
 
-    kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
-    kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
-    kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
-    kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
-    kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
-    kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
-    kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
-    kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
-    kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
-    kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
-    kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
-    kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
-    kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
-    kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
-    kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
-    kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
+  kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
+  kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
+  kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
+  kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
+  kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
+  kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
+  kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
+  kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
+  kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
+  kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
+  kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
+  kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
+  kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
+  kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
+  kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
+  kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
 
-    T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
-    T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
-    T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
-    T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
-    T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
-    T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
-    T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
-    T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
-    T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
-    T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
-    T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
-    T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
-    T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
-    T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
-    T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
-    T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
+  T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
+  T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
+  T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
+  T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
+  T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
+  T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
+  T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
+  T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
+  T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
+  T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
+  T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
+  T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
+  T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
+  T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
+  T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
+  T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
 
-    kernel.packet[0] = T0; kernel.packet[1] = T1;
-    kernel.packet[2] = T2; kernel.packet[3] = T3;
-    kernel.packet[4] = T4; kernel.packet[5] = T5;
-    kernel.packet[6] = T6; kernel.packet[7] = T7;
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
 }
 
 #define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
   EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
 
-#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE)     \
-  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], \
-                           INPUT[2 * INDEX + STRIDE]);
+#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
 
-#define SHUFFLE_EPI32(A, B, M) \
-  _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
+#define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
   __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
@@ -1854,8 +1849,7 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket,
-                                     const Packet16f& thenPacket,
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
                                      const Packet16f& elsePacket) {
   __mmask16 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
                 (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
@@ -1866,51 +1860,51 @@
   return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
-                                    const Packet8d& thenPacket,
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
                                     const Packet8d& elsePacket) {
-  __mmask8 m = (ifPacket.select[0]   )
-             | (ifPacket.select[1]<<1)
-             | (ifPacket.select[2]<<2)
-             | (ifPacket.select[3]<<3)
-             | (ifPacket.select[4]<<4)
-             | (ifPacket.select[5]<<5)
-             | (ifPacket.select[6]<<6)
-             | (ifPacket.select[7]<<7);
+  __mmask8 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
+               (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
+               (ifPacket.select[6] << 6) | (ifPacket.select[7] << 7);
   return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }
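
Both pblend overloads pack the Selector entries into a k-mask with one bit per lane; the blend then takes thenPacket where the bit is set and elsePacket elsewhere. A scalar reference (illustrative only):

// Lane i of the result is thenP[i] when select[i] is non-zero.
template <typename Scalar, int N>
void pblend_ref(const bool select[N], const Scalar thenP[N],
                const Scalar elseP[N], Scalar out[N]) {
  for (int i = 0; i < N; ++i) out[i] = select[i] ? thenP[i] : elseP[i];
}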
 
 // Packet math for Eigen::half
-template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
   return _mm256_set1_epi16(from.x);
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
   return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
   return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
   return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
   _mm256_store_si256((__m256i*)(void*)to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
   // (void*) -> workaround clang warning:
   // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
   _mm256_storeu_si256((__m256i*)(void*)to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h
-ploaddup<Packet16h>(const Eigen::half*  from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
   unsigned short a = from[0].x;
   unsigned short b = from[1].x;
   unsigned short c = from[2].x;
@@ -1922,8 +1916,8 @@
   return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h
-ploadquad(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
   unsigned short a = from[0].x;
   unsigned short b = from[1].x;
   unsigned short c = from[2].x;
@@ -1931,15 +1925,14 @@
   return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
 }
 
-EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
-  return _mm512_cvtph_ps(a);
-}
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtph_ps(a); }
 
 EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
-  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
   return Packet16h(ptrue(Packet8i(a)));
 }
 
@@ -1950,14 +1943,12 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
-                                              const Packet16h& b) {
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
   return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
-                                              const Packet16h& b) {
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
   return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
 }
 
@@ -1966,96 +1957,118 @@
   return float2half(plset<Packet16f>(static_cast<float>(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
   // in some cases Packet8i is a wrapper around __m256i, so we need to
   // cast to Packet8i to call the correct overload.
-  return Packet16h(por(Packet8i(a),Packet8i(b)));
+  return Packet16h(por(Packet8i(a), Packet8i(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
-  return Packet16h(pxor(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pxor(Packet8i(a), Packet8i(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
-  return Packet16h(pand(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pand(Packet8i(a), Packet8i(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
-  return Packet16h(pandnot(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
   return _mm256_blendv_epi8(b, a, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
   return float2half(pround<Packet16f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
   return float2half(print<Packet16f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
   return float2half(pceil<Packet16f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
   return float2half(pfloor<Packet16f>(half2float(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
   return Pack32To16(pcmp_eq(af, bf));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
   return Pack32To16(pcmp_le(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
   return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
   return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
   Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
   return _mm256_xor_si256(a, sign_mask);
 }
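
pnegate works directly on the raw half storage: IEEE half keeps its sign in bit 15, so XOR-ing with 0x8000 negates every lane without a float round-trip. Scalar sketch:

#include <cstdint>

// Flip the sign bit of one raw half value.
std::uint16_t negate_half_bits(std::uint16_t h) { return h ^ 0x8000u; }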
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
   Packet16f rf = padd(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
   Packet16f rf = psub(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
   Packet16f rf = pmul(af, bf);
   return float2half(rf);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
   Packet16f af = half2float(a);
   Packet16f bf = half2float(b);
   Packet16f rf = pdiv(af, bf);
   return float2half(rf);
 }
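
These half operations all follow the same pattern: widen to float, compute, narrow back. This is believed safe for +, -, * and / because float carries more than twice the precision of half (24 vs. 11 bits), so the intermediate float rounding does not change the final half result (the usual double-rounding argument); that rationale is background, not something the patch itself states. Scalar analogue:

// Same widen-compute-narrow pattern for a single pair of halves.
Eigen::half add_via_float(Eigen::half a, Eigen::half b) {
  return Eigen::half(static_cast<float>(a) + static_cast<float>(b));
}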
 
-template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
   Packet16f from_float = half2float(from);
   return half(predux(from_float));
 }
@@ -2069,64 +2082,64 @@
   return padd<Packet8h>(lane0, lane1);
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
   Packet16f af = half2float(a);
   float reduced = predux_max<Packet16f>(af);
   return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
   Packet16f af = half2float(a);
   float reduced = predux_min<Packet16f>(af);
   return Eigen::half(reduced);
 }
 
-template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
   Packet16f from_float = half2float(from);
   return half(predux_mul(from_float));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
-{
-  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
-  return _mm256_insertf128_si256(
-                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
-                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
+                                 _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
-{
-  return _mm256_set_epi16(
-      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
-      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
-      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
-      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+template <>
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+  return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                          from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                          from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                          from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
 }
 
-template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
   EIGEN_ALIGN64 half aux[16];
   pstore(aux, from);
-  to[stride*0] = aux[0];
-  to[stride*1] = aux[1];
-  to[stride*2] = aux[2];
-  to[stride*3] = aux[3];
-  to[stride*4] = aux[4];
-  to[stride*5] = aux[5];
-  to[stride*6] = aux[6];
-  to[stride*7] = aux[7];
-  to[stride*8] = aux[8];
-  to[stride*9] = aux[9];
-  to[stride*10] = aux[10];
-  to[stride*11] = aux[11];
-  to[stride*12] = aux[12];
-  to[stride*13] = aux[13];
-  to[stride*14] = aux[14];
-  to[stride*15] = aux[15];
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,16>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
   __m256i a = kernel.packet[0];
   __m256i b = kernel.packet[1];
   __m256i c = kernel.packet[2];
@@ -2233,8 +2246,7 @@
   kernel.packet[15] = a_p_f;
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
   EIGEN_ALIGN64 half in[8][16];
   pstore<half>(in[0], kernel.packet[0]);
   pstore<half>(in[1], kernel.packet[1]);
@@ -2249,10 +2261,10 @@
 
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
-      out[i][j] = in[j][2*i];
+      out[i][j] = in[j][2 * i];
     }
     for (int j = 0; j < 8; ++j) {
-      out[i][j+8] = in[j][2*i+1];
+      out[i][j + 8] = in[j][2 * i + 1];
     }
   }
 
@@ -2266,8 +2278,7 @@
   kernel.packet[7] = pload<Packet16h>(out[7]);
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
   EIGEN_ALIGN64 half in[4][16];
   pstore<half>(in[0], kernel.packet[0]);
   pstore<half>(in[1], kernel.packet[1]);
@@ -2278,16 +2289,16 @@
 
   for (int i = 0; i < 4; ++i) {
     for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][4*i];
+      out[i][j] = in[j][4 * i];
     }
     for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][4*i+1];
+      out[i][j + 4] = in[j][4 * i + 1];
     }
     for (int j = 0; j < 4; ++j) {
-      out[i][j+8] = in[j][4*i+2];
+      out[i][j + 8] = in[j][4 * i + 2];
     }
     for (int j = 0; j < 4; ++j) {
-      out[i][j+12] = in[j][4*i+3];
+      out[i][j + 12] = in[j][4 * i + 3];
     }
   }
 
@@ -2297,7 +2308,10 @@
   kernel.packet[3] = pload<Packet16h>(out[3]);
 }
 
-template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet16bf> {
+  enum { value = true };
+};
 
 template <>
 struct packet_traits<bfloat16> : default_packet_traits {
@@ -2315,24 +2329,29 @@
     HasRsqrt = 1,
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,  // Currently fails test with bad accuracy.
-    HasLog1p  = 1,
-    HasExpm1  = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
     HasNdtri = 1,
     HasBessel = 1,
 #endif
     HasExp = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1
   };
 };
 
 template <>
-struct unpacket_traits<Packet16bf>
-{
+struct unpacket_traits<Packet16bf> {
   typedef bfloat16 type;
-  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet8bf half;
 };
 
@@ -2359,19 +2378,17 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
-                                          const Packet16bf& from) {
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
   _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
-                                           const Packet16bf& from) {
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
   _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf
-ploaddup<Packet16bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(const bfloat16* from) {
   unsigned short a = from[0].value;
   unsigned short b = from[1].value;
   unsigned short c = from[2].value;
@@ -2383,8 +2400,8 @@
   return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf
-ploadquad(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) {
   unsigned short a = from[0].value;
   unsigned short b = from[1].value;
   unsigned short c = from[2].value;
@@ -2400,7 +2417,7 @@
 EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
   Packet16bf r;
 
-#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10,1,0)
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
   // GCC 10.1 supports avx512bf16 and a C-style explicit cast
   // (C++ static_cast is not supported yet), so do the conversion via the
   // intrinsic and register path for performance.
@@ -2426,7 +2443,7 @@
   t = _mm512_mask_blend_epi32(mask, nan, t);
   // output.value = static_cast<uint16_t>(input);
   r = _mm512_cvtepi32_epi16(t);
-#endif // EIGEN_VECTORIZE_AVX512BF16
+#endif  // EIGEN_VECTORIZE_AVX512BF16
 
   return r;
 }
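
The fallback path above performs float-to-bfloat16 conversion with round-to-nearest-even and a quiet-NaN fixup. A scalar model of the same rounding, assuming the standard bias trick (illustrative only):

#include <cstdint>
#include <cstring>

std::uint16_t f32_to_bf16_bits(float f) {
  std::uint32_t input;
  std::memcpy(&input, &f, sizeof(input));
  if ((input & 0x7fffffffu) > 0x7f800000u) return 0x7fc0u;  // quiet NaN
  input += 0x7fffu + ((input >> 16) & 1u);  // round to nearest even
  return static_cast<std::uint16_t>(input >> 16);
}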
@@ -2452,58 +2469,54 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
-                                       const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) {
   return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
-                                       const Packet16bf& a,
-                                       const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, const Packet16bf& a, const Packet16bf& b) {
   // The input mask is expected to be all 0/1 per element, so handle it
   // with the 8-bit blend intrinsic for performance.
   return _mm256_blendv_epi8(b, a, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a) {
   return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
   return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
   return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
   return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
-                                       const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
   return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
-                                       const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, const Packet16bf& b) {
   return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
-                                       const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, const Packet16bf& b) {
   return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
-                                              const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet16bf& b) {
   return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
@@ -2525,38 +2538,32 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
-                                                const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
   return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
@@ -2594,8 +2601,8 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
-  __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
-                               14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+  __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
+                               4, 5, 2, 3, 0, 1);
 
   Packet16bf res;
   // Swap hi and lo first because shuffle is in 128-bit lanes.
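
Editor's note: vpshufb shuffles within 128-bit lanes only, which is why the code first swaps the two halves and then applies the byte index vector m to each half. The net lane mapping, as a scalar model (illustrative):

#include <cstdint>

// preverse on 16 bfloat16 lanes: out[i] = in[15 - i].
static void reverse_lanes(const std::uint16_t (&in)[16], std::uint16_t (&out)[16]) {
  for (int i = 0; i < 16; ++i) out[i] = in[15 - i];
}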
@@ -2605,40 +2612,37 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
-                                                             Index stride) {
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from, Index stride) {
   return _mm256_set_epi16(
-      from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
-      from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
-      from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
-      from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+      from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
+      from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
+      from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
+      from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
-                                                        const Packet16bf& from,
-                                                        Index stride) {
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to, const Packet16bf& from, Index stride) {
   EIGEN_ALIGN64 bfloat16 aux[16];
   pstore(aux, from);
-  to[stride*0] = aux[0];
-  to[stride*1] = aux[1];
-  to[stride*2] = aux[2];
-  to[stride*3] = aux[3];
-  to[stride*4] = aux[4];
-  to[stride*5] = aux[5];
-  to[stride*6] = aux[6];
-  to[stride*7] = aux[7];
-  to[stride*8] = aux[8];
-  to[stride*9] = aux[9];
-  to[stride*10] = aux[10];
-  to[stride*11] = aux[11];
-  to[stride*12] = aux[12];
-  to[stride*13] = aux[13];
-  to[stride*14] = aux[14];
-  to[stride*15] = aux[15];
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
 }
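
Editor's note: both routines realize the generic strided-access contract — gather reads lane i from from[i * stride], and scatter stores lane i to to[stride * i] via an aligned spill buffer. A scalar sketch of that contract (illustrative, not Eigen API; stride is in elements):

#include <cstddef>

// Hypothetical scalar model of pgather/pscatter for any element type T.
template <typename T, std::size_t N>
static void gather(T (&dst)[N], const T* from, std::size_t stride) {
  for (std::size_t i = 0; i < N; ++i) dst[i] = from[i * stride];
}
template <typename T, std::size_t N>
static void scatter(T* to, const T (&src)[N], std::size_t stride) {
  for (std::size_t i = 0; i < N; ++i) to[i * stride] = src[i];
}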
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
   __m256i a = kernel.packet[0];
   __m256i b = kernel.packet[1];
   __m256i c = kernel.packet[2];
@@ -2728,7 +2732,7 @@
   kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
   __m256i a = kernel.packet[0];
   __m256i b = kernel.packet[1];
   __m256i c = kernel.packet[2];
@@ -2751,8 +2755,8 @@
   kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_AVX512_H
+#endif  // EIGEN_PACKET_MATH_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index faa3853..131e6f1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
 #define EIGEN_PACKET_MATH_FP16_AVX512_H
 
-// IWYU pragma: private
+// IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
 
 namespace Eigen {
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index a3025ec..903bca5 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -108,7 +108,7 @@
   int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
   return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
 }
-#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
+#else  // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
 #define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0
 #define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
 #define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
@@ -118,8 +118,8 @@
  * Used by gemmKernel for the case A/B row-major and C col-major.
  */
 template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
-EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
-                                            Scalar *C_arr, int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
+EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, Scalar *C_arr,
+                                     int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
   EIGEN_UNUSED_VARIABLE(remN_);
   EIGEN_UNUSED_VARIABLE(remM_);
   using urolls = unrolls::trans<Scalar>;
@@ -811,7 +811,7 @@
  */
 template <typename Scalar, bool toTemp = true, bool remM = false>
 EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
-                                                int64_t remM_ = 0) {
+                                         int64_t remM_ = 0) {
   EIGEN_UNUSED_VARIABLE(remM_);
   using urolls = unrolls::transB<Scalar>;
   using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
@@ -1062,7 +1062,8 @@
 // Template specializations of trsmKernelL/R for float/double and inner strides of 1.
 #if (EIGEN_USE_AVX512_TRSM_KERNELS)
 #if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride, bool Specialized>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
 struct trsmKernelR;
 
 template <typename Index, int Mode, int TriStorageOrder>
@@ -1085,7 +1086,7 @@
 #ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
-          size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
     return;
   }
 #endif
@@ -1101,7 +1102,7 @@
 #ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
-          size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
     return;
   }
 #endif
@@ -1112,7 +1113,8 @@
 
 // These trsm kernels require temporary memory allocation
 #if (EIGEN_USE_AVX512_TRSM_L_KERNELS)
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride, bool Specialized = true>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized = true>
 struct trsmKernelL;
 
 template <typename Index, int Mode, int TriStorageOrder>
@@ -1135,7 +1137,7 @@
 #ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
-          size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
     return;
   }
 #endif
@@ -1151,7 +1153,7 @@
 #ifdef EIGEN_RUNTIME_NO_MALLOC
   if (!is_malloc_allowed()) {
     trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
-          size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
     return;
   }
 #endif
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 5053230..56a94f4 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -17,161 +17,207 @@
 
 namespace internal {
 
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
 
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
 
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
 
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
 
-template<> struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
-template<> struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
 
-template<> struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
-template<> struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
 
-template<> EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
   __mmask16 mask = _mm512_cmpneq_ps_mask(a, pzero(a));
   return _mm512_maskz_cvtepi32_epi8(mask, _mm512_set1_epi32(1));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16b, Packet16f>(const Packet16b& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16b, Packet16f>(const Packet16b& a) {
   return _mm512_cvtepi32_ps(_mm512_and_si512(_mm512_cvtepi8_epi32(a), _mm512_set1_epi32(1)));
 }
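
Editor's note: the two casts above encode the usual numeric/boolean convention — any nonzero (or NaN) float lane becomes 1, and a bool lane maps to 0.0f or 1.0f. Per-lane model (illustrative):

// Scalar semantics of pcast<Packet16f, Packet16b> and its inverse.
inline bool float_to_bool(float x) { return x != 0.0f; }        // cmpneq mask, then 0/1
inline float bool_to_float(bool b) { return b ? 1.0f : 0.0f; }  // mask with 1, then convert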
 
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
   return _mm512_cvttps_epi32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
   return _mm512_cvtps_pd(_mm512_castps512_ps256(a));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
   return _mm512_cvtps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
   return _mm512_cvtepi32_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet16i, Packet8d>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16i, Packet8d>(const Packet16i& a) {
   return _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
   return _mm512_cvtepi32_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
-  return  cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
+  return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
-  return  cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
+  return cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8d, Packet8i>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8d, Packet8i>(const Packet8d& a) {
   return _mm512_cvtpd_epi32(a);
 }
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8d, Packet8f>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8d, Packet8f>(const Packet8d& a) {
   return _mm512_cvtpd_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
   return _mm512_castps_si512(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
   return _mm512_castsi512_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f& a) {
   return _mm512_castps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
   return _mm512_castpd_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet16f>(const Packet16f& a) {
   return _mm512_castps512_ps256(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
   return _mm512_castps512_ps128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
   return _mm512_castpd512_pd256(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
   return _mm512_castpd512_pd128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
   return _mm512_castps256_ps512(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
   return _mm512_castps128_ps512(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
   return _mm512_castpd256_pd512(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
   return _mm512_castpd128_pd512(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet16i>(const Packet16i& a) {
   return _mm512_castsi512_si256(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i& a) {
   return _mm512_castsi512_si128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
   return _mm256_castsi256_si128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
   return _mm256_castsi256_si128(a);
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512FP16
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
   return half2float(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
   return float2half(a);
 }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
   return Bf16ToF32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
   return F32ToBf16(a);
 }
 
 #ifdef EIGEN_VECTORIZE_AVX512FP16
 
-template<> EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
   return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
 }
-template<> EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
   return _mm256_castsi256_si128(preinterpret<Packet16h>(a));
 }
 
@@ -182,12 +228,13 @@
   return _mm512_cvtxph_ps(_mm256_castsi256_ph(low));
 }
 
-
 template <>
 EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
   __m512d result = _mm512_undefined_pd();
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0);
-  result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1);
+  result = _mm512_insertf64x4(
+      result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
+  result = _mm512_insertf64x4(
+      result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
   return _mm512_castpd_ph(result);
 }
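
Editor's note: semantically this packs two Packet16f inputs into one Packet32h — lanes 0-15 from a, lanes 16-31 from b, each converted with round-to-nearest-even by vcvtps2ph. Lane-mapping sketch (illustrative; to_half is a hypothetical stand-in for that conversion, not Eigen API):

#include <cstdint>

std::uint16_t to_half(float);  // assumed RNE float->fp16 helper (hypothetical)

static void pack_two_f32(const float (&a)[16], const float (&b)[16], std::uint16_t (&out)[32]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = to_half(a[i]);       // low 256 bits  <- a
    out[16 + i] = to_half(b[i]);  // high 256 bits <- b
  }
}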
 
@@ -198,12 +245,13 @@
   return _mm256_cvtxph_ps(_mm_castsi128_ph(low));
 }
 
-
 template <>
 EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
   __m256d result = _mm256_undefined_pd();
-  result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0);
-  result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1);
+  result = _mm256_insertf64x2(result,
+                              _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
+  result = _mm256_insertf64x2(result,
+                              _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
   return _mm256_castpd_si256(result);
 }
 
@@ -214,7 +262,6 @@
   return _mm256_extractf32x4_ps(full, 0);
 }
 
-
 template <>
 EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
   __m256 result = _mm256_undefined_ps();
@@ -223,11 +270,10 @@
   return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT);
 }
 
-
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_AVX512_H
+#endif  // EIGEN_TYPE_CASTING_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 915b01b..7bfc61d 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -18,25 +18,28 @@
 
 namespace internal {
 
-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet4ui p4ui_CONJ_XOR =
+    vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);  //{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
 #ifdef EIGEN_VECTORIZE_VSX
 #if defined(_BIG_ENDIAN)
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+    (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
 #else
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+    (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
 #endif
 #endif
 
 //---------- float ----------
-struct Packet2cf
-{
+struct Packet2cf {
   EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
-  {
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
     Packet4f v1, v2;
 
     // Permute and multiply the real parts of a and b
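
Editor's note: the body (continued in unchanged context below) computes the standard complex product (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br), splatting the real and imaginary parts of a across lanes before multiplying. Reference semantics on one lane (illustrative):

#include <complex>

// What Packet2cf::pmul computes per complex lane.
static std::complex<float> complex_mul(std::complex<float> a, std::complex<float> b) {
  return {a.real() * b.real() - a.imag() * b.imag(),
          a.real() * b.imag() + a.imag() * b.real()};
}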
@@ -58,33 +61,25 @@
     v = pmul(Packet2cf(*this), b).v;
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
-    return Packet2cf(*this) *= b;
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
 
   EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
     v = padd(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
-    return Packet2cf(*this) += b;
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
   EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
     v = psub(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
-    return Packet2cf(*this) -= b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
-    return Packet2cf(-v);
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(-v); }
 
-  Packet4f  v;
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
   typedef Packet2cf half;
   typedef Packet4f as_real;
@@ -93,160 +88,232 @@
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSqrt   = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSqrt = 1,
 #ifdef EIGEN_VECTORIZE_VSX
-    HasBlend  = 1,
+    HasBlend = 1,
 #endif
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
 
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   Packet2cf res;
 #ifdef EIGEN_VECTORIZE_VSX
   // Load a single std::complex<float> from memory and duplicate
   //
   // Using pload would read past the end of the reference in this case
   // Using vec_xl_len + vec_splat, generates poor assembly
-  __asm__ ("lxvdsx %x0,%y1" : "=wa" (res.v) : "Z" (from));
+  __asm__("lxvdsx %x0,%y1" : "=wa"(res.v) : "Z"(from));
 #else
-  if((std::ptrdiff_t(&from) % 16) == 0)
-    res.v = pload<Packet4f>((const float *)&from);
+  if ((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float*)&from);
   else
-    res.v = ploadu<Packet4f>((const float *)&from);
+    res.v = ploadu<Packet4f>((const float*)&from);
   res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
 #endif
   return res;
 }
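
Editor's note: whichever branch is taken, the result is the one complex value broadcast into both lanes, v = [re, im, re, im]; the alignment test in the fallback merely picks the appropriate 16-byte load before the duplicating permute. Scalar model (illustrative):

#include <complex>

// Semantics of pset1<Packet2cf>: broadcast one complex into both lanes.
static void broadcast_complex(const std::complex<float>& from, float (&v)[4]) {
  v[0] = v[2] = from.real();
  v[1] = v[3] = from.imag();
}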
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>*        from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>*       from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
-template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
-{
-  return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  return Packet2cf(pload<Packet4f>((const float*)from));
 }
-template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
-{
-  return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2, offset * 2));
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  return Packet2cf(ploadu<Packet4f>((const float*)from));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from) { return pset1<Packet2cf>(*from); }
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+                                                       const Index offset) {
+  return Packet2cf(pload_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+                                                        const Index offset) {
+  return Packet2cf(ploadu_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
-template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> *  to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> *  to, const Packet2cf& from, const Index n, const Index offset) { pstoreu_partial((float*)to, from.v, n * 2, offset * 2); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+                                                              const Index n, const Index offset) {
+  pstore_partial((float*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+                                                               const Index n, const Index offset) {
+  pstoreu_partial((float*)to, from.v, n * 2, offset * 2);
+}
 
-EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
-{
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1) {
   Packet4f res0, res1;
 #ifdef EIGEN_VECTORIZE_VSX
   // Load two std::complex<float> from memory and combine
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
+  __asm__("lxsdx %x0,%y1" : "=wa"(res0) : "Z"(from0));
+  __asm__("lxsdx %x0,%y1" : "=wa"(res1) : "Z"(from1));
 #ifdef _BIG_ENDIAN
-  __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+  __asm__("xxpermdi %x0, %x1, %x2, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
 #else
-  __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+  __asm__("xxpermdi %x0, %x2, %x1, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
 #endif
 #else
-  *reinterpret_cast<std::complex<float> *>(&res0) = from0;
-  *reinterpret_cast<std::complex<float> *>(&res1) = from1;
+  *reinterpret_cast<std::complex<float>*>(&res0) = from0;
+  *reinterpret_cast<std::complex<float>*>(&res1) = from1;
   res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
 #endif
   return Packet2cf(res0);
 }
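
Editor's note: independent of which VSX/permute path is taken, the contract is that the two scalars land in adjacent complex lanes. Scalar model (illustrative):

#include <complex>

// Semantics of pload2: pack two complex values into one 4-float packet.
static void load2(const std::complex<float>& from0, const std::complex<float>& from1, float (&v)[4]) {
  v[0] = from0.real(); v[1] = from0.imag();  // lane 0
  v[2] = from1.real(); v[3] = from1.imag();  // lane 1
}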
 
-template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>*     from)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from) {
   Packet2cf res;
   res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
   return res;
 }
 
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
-{
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride,
+                                                                   const Index n = 2) {
   eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
   EIGEN_ALIGN16 Scalar af[2];
   for (Index i = 0; i < n; i++) {
-    af[i] = from[i*stride];
+    af[i] = from[i * stride];
   }
   return pload_ignore<Packet>(af);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                                        Index stride) {
   return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf
+pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n) {
   return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
 }
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
-{
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride,
+                                                                  const Index n = 2) {
   eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
   EIGEN_ALIGN16 Scalar af[2];
-  pstore<Scalar>((Scalar *) af, from);
+  pstore<Scalar>((Scalar*)af, from);
   for (Index i = 0; i < n; i++) {
-    to[i*stride] = af[i];
+    to[i * stride] = af[i];
   }
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                                    const Packet2cf& from,
+                                                                                    Index stride) {
   pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                                            const Packet2cf& from,
+                                                                                            Index stride,
+                                                                                            const Index n) {
   pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr)    { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
   EIGEN_ALIGN16 std::complex<float> res[2];
-  pstore((float *)&res, a.v);
+  pstore((float*)&res, a.v);
 
   return res[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
   Packet4f rev_a;
   rev_a = vec_sld(a.v, a.v, 8);
   return Packet2cf(rev_a);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
   Packet4f b;
   b = vec_sld(a.v, a.v, 8);
   b = padd<Packet4f>(a.v, b);
   return pfirst<Packet2cf>(Packet2cf(b));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   Packet4f b;
   Packet2cf prod;
   b = vec_sld(a.v, a.v, 8);
@@ -255,23 +322,24 @@
   return pfirst<Packet2cf>(prod);
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
   return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
 #ifdef EIGEN_VECTORIZE_VSX
-  Packet4f tmp = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
-  kernel.packet[1].v = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+  Packet4f tmp = reinterpret_cast<Packet4f>(
+      vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+  kernel.packet[1].v = reinterpret_cast<Packet4f>(
+      vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
 #else
   Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
@@ -279,33 +347,35 @@
   kernel.packet[0].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
-  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v, b.v));
   return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
 }
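
Editor's note: vec_cmpeq compares the four float lanes independently, so eq holds [re==re', im==im', ...]; ANDing eq with its real/imag-swapped copy leaves a complex lane all-ones only when both of its comparisons passed. Per-lane model (illustrative):

#include <cstdint>

// m_re / m_im are per-float masks (0 or 0xFFFFFFFF) from the lane-wise compare;
// the swap-and-AND makes both 32-bit halves of the complex lane agree.
static void complex_eq_lane(std::uint32_t m_re, std::uint32_t m_im, std::uint32_t (&lane)[2]) {
  lane[0] = m_re & m_im;
  lane[1] = m_im & m_re;
}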
 
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
   Packet2cf result;
-  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  result.v = reinterpret_cast<Packet4f>(
+      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
   return result;
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
   return psqrt_complex<Packet2cf>(a);
 }
 
 //---------- double ----------
 #ifdef EIGEN_VECTORIZE_VSX
-struct Packet1cd
-{
+struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
-  {
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
     Packet2d a_re, a_im, v1, v2;
 
     // Permute and multiply the real parts of a and b
@@ -326,33 +396,25 @@
     v = pmul(Packet1cd(*this), b).v;
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
-    return Packet1cd(*this) *= b;
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
 
   EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
     v = padd(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
-    return Packet1cd(*this) += b;
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
   EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
     v = psub(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
-    return Packet1cd(*this) -= b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
-    return Packet1cd(-v);
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(-v); }
 
   Packet2d v;
 };
 
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet1cd type;
   typedef Packet1cd half;
   typedef Packet2d as_real;
@@ -361,123 +423,204 @@
     AlignedOnScalar = 0,
     size = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSqrt   = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSqrt = 1,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+};
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+                                                       const Index offset) {
   return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
 }
-template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+                                                        const Index offset) {
   return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
 }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
-template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> *  to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> *  to, const Packet1cd& from, const Index n, const Index offset) { pstoreu_partial((double*)to, from.v, n * 2, offset * 2); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  pstoreu((double*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+                                                               const Index n, const Index offset) {
+  pstore_partial((double*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+                                                                const Index n, const Index offset) {
+  pstoreu_partial((double*)to, from.v, n * 2, offset * 2);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index) {
   return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index) {
   return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                                     const Packet1cd& from, Index) {
   pstore<std::complex<double> >(to, from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                                             const Packet1cd& from,
+                                                                                             Index, const Index) {
   pstore<std::complex<double> >(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pandnot(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)  { return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr)    { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 std::complex<double> res[1];
   pstore<std::complex<double> >(res, a);
 
   return res[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   return pdiv_complex(a, b);
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
   return Packet1cd(preverse(Packet2d(x.v)));
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
   Packet2d tmp = vec_mergeh(kernel.packet[0].v, kernel.packet[1].v);
   kernel.packet[1].v = vec_mergel(kernel.packet[0].v, kernel.packet[1].v);
   kernel.packet[0].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
   // Compare real and imaginary parts of a and b to get the mask vector:
   // [re(a)==re(b), im(a)==im(b)]
-  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
+  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v, b.v));
   // Swap real/imag elements in the mask to get:
   // [im(a)==im(b), re(a)==re(b)]
-  Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+  Packet2d eq_swapped =
+      reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
   // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
   return Packet1cd(vec_and(eq, eq_swapped));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
   return psqrt_complex<Packet1cd>(a);
 }
 
-#endif // __VSX__
-} // end namespace internal
+#endif  // __VSX__
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX32_ALTIVEC_H
+#endif  // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index a8a2309..c95ee38 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -25,50 +25,47 @@
 #endif
 
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
-  return  vec_sqrt(x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  return vec_sqrt(x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x)
-{
-  return  vec_sqrt(x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+  return vec_sqrt(x);
 }
 
 #if !EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
-//  vec_rsqrt returns different results from the generic version
-//  return  vec_rsqrt(x);
+  //  vec_rsqrt returns different results from the generic version
+  //  return  vec_rsqrt(x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
   return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
-//  vec_rsqrt returns different results from the generic version
-//  return  vec_rsqrt(x);
+  //  vec_rsqrt returns different results from the generic version
+  //  return  vec_rsqrt(x);
 }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
 }
 
 #if !EIGEN_COMP_CLANG
-template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
 }
 #endif
 #else
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
   Packet4f a;
   for (Index i = 0; i < packet_traits<float>::size; i++) {
     a[i] = numext::sqrt(x[i]);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index e9a9307..94306da 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -12,17 +12,17 @@
 #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
 
 #ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
-#define EIGEN_ALTIVEC_USE_CUSTOM_PACK    1
+#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
 #endif
 
 #if !defined(EIGEN_ALTIVEC_DISABLE_MMA)
 #define EIGEN_ALTIVEC_DISABLE_MMA 0
 #endif
 
-// Check for MMA builtin support. 
+// Check for MMA builtin support.
 #if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin)
 #if __has_builtin(__builtin_mma_assemble_acc)
-  #define EIGEN_ALTIVEC_MMA_SUPPORT
+#define EIGEN_ALTIVEC_MMA_SUPPORT
 #endif
 #endif
 
@@ -41,12 +41,12 @@
 #define EIGEN_ALTIVEC_MMA_ONLY 1
 #endif
 
-#endif // EIGEN_ALTIVEC_MMA_SUPPORT
+#endif  // EIGEN_ALTIVEC_MMA_SUPPORT
 
 #include "MatrixProductCommon.h"
 
 #if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
-  #include "MatrixProductMMA.h"
+#include "MatrixProductMMA.h"
 #endif
 
 // IWYU pragma: private
@@ -59,71 +59,41 @@
 /**************************
  * Constants and typedefs *
  **************************/
-template<typename Scalar>
-struct quad_traits
-{
-  typedef typename packet_traits<Scalar>::type    vectortype;
-  typedef PacketBlock<vectortype,4>                     type;
-  typedef vectortype                                 rhstype;
-  enum
-  {
-    vectorsize = packet_traits<Scalar>::size,
-    size = 4,
-    rows = 4
-  };
+template <typename Scalar>
+struct quad_traits {
+  typedef typename packet_traits<Scalar>::type vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef vectortype rhstype;
+  enum { vectorsize = packet_traits<Scalar>::size, size = 4, rows = 4 };
 };
 
-template<>
-struct quad_traits<double>
-{
-  typedef Packet2d                        vectortype;
-  typedef PacketBlock<vectortype,4>             type;
-  typedef PacketBlock<Packet2d,2>            rhstype;
-  enum
-  {
-    vectorsize = packet_traits<double>::size,
-    size = 2,
-    rows = 4
-  };
+template <>
+struct quad_traits<double> {
+  typedef Packet2d vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef PacketBlock<Packet2d, 2> rhstype;
+  enum { vectorsize = packet_traits<double>::size, size = 2, rows = 4 };
 };
 
-template<>
-struct quad_traits<bfloat16>
-{
-  typedef Packet8bf                       vectortype;
-  typedef PacketBlock<vectortype,4>             type;
-  typedef vectortype                         rhstype;
-  enum
-  {
-    vectorsize = packet_traits<bfloat16>::size,
-    size = 8,
-    rows = 4
-  };
+template <>
+struct quad_traits<bfloat16> {
+  typedef Packet8bf vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef vectortype rhstype;
+  enum { vectorsize = packet_traits<bfloat16>::size, size = 8, rows = 4 };
 };
 
 // MatrixProduct decomposes complex vectors into a real vector and an imaginary vector; this turned out
 // to be faster than Eigen's usual approach of keeping real/imaginary pairs on a single vector. These
 // constants are then responsible for converting between Eigen's layout and MatrixProduct's layout.
 
-const static Packet16uc p16uc_GETREAL32 = {  0,  1,  2,  3,
-                                             8,  9, 10, 11,
-                                            16, 17, 18, 19,
-                                            24, 25, 26, 27};
+const static Packet16uc p16uc_GETREAL32 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
 
-const static Packet16uc p16uc_GETIMAG32 = {  4,  5,  6,  7,
-                                            12, 13, 14, 15,
-                                            20, 21, 22, 23,
-                                            28, 29, 30, 31};
+const static Packet16uc p16uc_GETIMAG32 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
 
-const static Packet16uc p16uc_GETREAL32b = {  0,  1,  2,  3,
-                                             16, 17, 18, 19,
-                                              8,  9, 10, 11,
-                                             24, 25, 26, 27};
+const static Packet16uc p16uc_GETREAL32b = {0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27};
 
-const static Packet16uc p16uc_GETIMAG32b = {  4,  5,  6,  7,
-                                             20, 21, 22, 23,
-                                             12, 13, 14, 15,
-                                             28, 29, 30, 31};
+const static Packet16uc p16uc_GETIMAG32b = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
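As a quick illustration of how these permute masks work (an editor's sketch, not part of the patch): vec_perm selects bytes from the 32-byte concatenation of its two inputs, so the masks above gather the four real parts, respectively the four imaginary parts, of four interleaved complex floats; the "b" variants produce the cross-input order [re0, re2, re1, re3] used on the transposed path. The helper name splitComplexLanes is hypothetical.

// Sketch: split lo = [re0,im0,re1,im1] and hi = [re2,im2,re3,im3]
// into re = [re0,re1,re2,re3] and im = [im0,im1,im2,im3].
static inline void splitComplexLanes(Packet4f lo, Packet4f hi, Packet4f& re, Packet4f& im) {
  re = vec_perm(lo, hi, p16uc_GETREAL32);  // bytes 0-3, 8-11, 16-19, 24-27
  im = vec_perm(lo, hi, p16uc_GETIMAG32);  // bytes 4-7, 12-15, 20-23, 28-31
}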
 
 /*********************************************
  * Single precision real and complex packing *
@@ -131,55 +101,50 @@
 
 /**
  * Symm packing is related to packing of symmetric adjoint blocks; as expected, the packing leaves
- * the diagonal real, whatever is below it is copied from the respective upper diagonal element and 
+ * the diagonal real; whatever is below it is copied from the respective upper diagonal element and
  * conjugated. There's no PanelMode available for symm packing.
  *
- * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using 
+ * Packing in general is supposed to leave the lhs block and the rhs block easy for gemm to read using
  * its respective rank-update instructions. The float32/64 versions are different because, at the moment,
  * the size of the accumulator is fixed at 512 bits, so you can't have a 4x4 accumulator of 64-bit elements.
- * 
+ *
  * As mentioned earlier, MatrixProduct breaks complex numbers into a real vector and an imaginary vector, so
  * packing has to take that into account. At the moment we pack the real part first and then the imaginary part;
  * this is the main reason why packing for complex is broken down into several different parts, and also the
  * reason why we end up having a float32/64 and a complex float32/64 version.
  **/
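For intuition, here is a minimal scalar sketch (an editor's illustration, not part of the patch) of the reals-then-imaginaries block layout described above; packSplitPanel and Mat are hypothetical stand-ins, while the real SIMD helpers follow below.

// Pack one vectorSize-wide column panel of a complex matrix, writing all
// real parts first and all imaginary parts vectorSize*depth scalars later,
// mirroring the rir/rii cursors used by symm_pack_complex_rhs_helper.
template <typename Scalar, typename Mat>
void packSplitPanel(Scalar* blockf, const Mat& src, Index depth, Index vectorSize) {
  Index rir = 0;                   // cursor into the real plane
  Index rii = vectorSize * depth;  // imaginary plane starts after all reals
  for (Index i = 0; i < depth; i++) {
    for (Index k = 0; k < vectorSize; k++) {
      std::complex<Scalar> v = src(i, k);  // assumes Mat exposes (row, col) access
      blockf[rir + k] = v.real();
      blockf[rii + k] = v.imag();
    }
    rir += vectorSize;
    rii += vectorSize;
  }
}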
-template<typename Scalar, int StorageOrder>
-EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(
+    Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt) {
   std::complex<Scalar> v;
-  if(i < j)
-  {
-    v.real( dt(j,i).real());
-    v.imag(-dt(j,i).imag());
-  } else if(i > j)
-  {
-    v.real( dt(i,j).real());
-    v.imag( dt(i,j).imag());
+  if (i < j) {
+    v.real(dt(j, i).real());
+    v.imag(-dt(j, i).imag());
+  } else if (i > j) {
+    v.real(dt(i, j).real());
+    v.imag(dt(i, j).imag());
   } else {
-    v.real( dt(i,j).real());
+    v.real(dt(i, j).real());
     v.imag((Scalar)0.0);
   }
   return v;
 }
 
-template<typename Scalar, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-{
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs,
+                                                      Index rhsStride, Index rows, Index cols, Index k2) {
   const Index depth = k2 + rows;
   const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
-  const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
+  const Index vectorSize = N * quad_traits<Scalar>::vectorsize;
   const Index vectorDelta = vectorSize * rows;
-  Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
+  Scalar* blockBf = reinterpret_cast<Scalar*>(blockB);
 
   Index rir = 0, rii, j = 0;
-  for(; j + vectorSize <= cols; j+=vectorSize)
-  {
+  for (; j + vectorSize <= cols; j += vectorSize) {
     rii = rir + vectorDelta;
 
-    for(Index i = k2; i < depth; i++)
-    {
-      for(Index k = 0; k < vectorSize; k++)
-      {
+    for (Index i = k2; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
         std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j + k, rhs);
 
         blockBf[rir + k] = v.real();
@@ -192,12 +157,10 @@
     rir += vectorDelta;
   }
 
-  for(; j < cols; j++)
-  {
+  for (; j < cols; j++) {
     rii = rir + rows;
 
-    for(Index i = k2; i < depth; i++)
-    {
+    for (Index i = k2; i < depth; i++) {
       std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j, rhs);
 
       blockBf[rir] = v.real();
@@ -211,25 +174,22 @@
   }
 }
 
-template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs,
+                                                      Index lhsStride, Index cols, Index rows) {
   const Index depth = cols;
   const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
   const Index vectorSize = quad_traits<Scalar>::vectorsize;
   const Index vectorDelta = vectorSize * depth;
-  Scalar* blockAf = reinterpret_cast<Scalar *>(blockA);
+  Scalar* blockAf = reinterpret_cast<Scalar*>(blockA);
 
   Index rir = 0, rii, j = 0;
-  for(; j + vectorSize <= rows; j+=vectorSize)
-  {
+  for (; j + vectorSize <= rows; j += vectorSize) {
     rii = rir + vectorDelta;
 
-    for(Index i = 0; i < depth; i++)
-    {
-      for(Index k = 0; k < vectorSize; k++)
-      {
-        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j+k, i, lhs);
+    for (Index i = 0; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j + k, i, lhs);
 
         blockAf[rir + k] = v.real();
         blockAf[rii + k] = v.imag();
@@ -241,15 +201,12 @@
     rir += vectorDelta;
   }
 
-  if (j < rows)
-  {
+  if (j < rows) {
     rii = rir + ((rows - j) * depth);
 
-    for(Index i = 0; i < depth; i++)
-    {
+    for (Index i = 0; i < depth; i++) {
       Index k = j;
-      for(; k < rows; k++)
-      {
+      for (; k < rows; k++) {
         std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(k, i, lhs);
 
         blockAf[rir] = v.real();
@@ -262,35 +219,30 @@
   }
 }
 
-template<typename Scalar, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-{
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows,
+                                              Index cols, Index k2) {
   const Index depth = k2 + rows;
   const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
   const Index vectorSize = quad_traits<Scalar>::vectorsize;
 
   Index ri = 0, j = 0;
-  for(; j + N*vectorSize <= cols; j+=N*vectorSize)
-  {
+  for (; j + N * vectorSize <= cols; j += N * vectorSize) {
     Index i = k2;
-    for(; i < depth; i++)
-    {
-      for(Index k = 0; k < N*vectorSize; k++)
-      {
-        if(i <= j+k)
-          blockB[ri + k] = rhs(j+k, i);
+    for (; i < depth; i++) {
+      for (Index k = 0; k < N * vectorSize; k++) {
+        if (i <= j + k)
+          blockB[ri + k] = rhs(j + k, i);
         else
-          blockB[ri + k] = rhs(i, j+k);
+          blockB[ri + k] = rhs(i, j + k);
       }
-      ri += N*vectorSize;
+      ri += N * vectorSize;
     }
   }
 
-  for(; j < cols; j++)
-  {
-    for(Index i = k2; i < depth; i++)
-    {
-      if(j <= i)
+  for (; j < cols; j++) {
+    for (Index i = k2; i < depth; i++) {
+      if (j <= i)
         blockB[ri] = rhs(i, j);
       else
         blockB[ri] = rhs(j, i);
@@ -299,39 +251,33 @@
   }
 }
 
-template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols,
+                                              Index rows) {
   const Index depth = cols;
   const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
   const Index vectorSize = quad_traits<Scalar>::vectorsize;
 
   Index ri = 0, j = 0;
-  for(; j + vectorSize <= rows; j+=vectorSize)
-  {
+  for (; j + vectorSize <= rows; j += vectorSize) {
     Index i = 0;
 
-    for(; i < depth; i++)
-    {
-      for(Index k = 0; k < vectorSize; k++)
-      {
-        if(i <= j+k)
-          blockA[ri + k] = lhs(j+k, i);
+    for (; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
+        if (i <= j + k)
+          blockA[ri + k] = lhs(j + k, i);
         else
-          blockA[ri + k] = lhs(i, j+k);
+          blockA[ri + k] = lhs(i, j + k);
       }
       ri += vectorSize;
     }
   }
 
-  if (j < rows)
-  {
-    for(Index i = 0; i < depth; i++)
-    {
+  if (j < rows) {
+    for (Index i = 0; i < depth; i++) {
       Index k = j;
-      for(; k < rows; k++)
-      {
-        if(i <= k)
+      for (; k < rows; k++) {
+        if (i <= k)
           blockA[ri] = lhs(k, i);
         else
           blockA[ri] = lhs(i, k);
@@ -341,85 +287,73 @@
   }
 }
 
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
-{
-  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-  {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder> {
+  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols,
+                  Index k2) {
     symm_pack_complex_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
-{
-  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
-  {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols,
+                  Index rows) {
     symm_pack_complex_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
 // *********** symm_pack std::complex<float64> ***********
 
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
-{
-  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-  {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder> {
+  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows,
+                  Index cols, Index k2) {
     symm_pack_complex_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
-{
-  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
-  {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols,
+                  Index rows) {
     symm_pack_complex_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
 // *********** symm_pack float32 ***********
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<float, Index, nr, StorageOrder>
-{
-  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-  {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<float, Index, nr, StorageOrder> {
+  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
     symm_pack_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
-{
-  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
-  {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) {
     symm_pack_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
 // *********** symm_pack float64 ***********
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<double, Index, nr, StorageOrder>
-{
-  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-  {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<double, Index, nr, StorageOrder> {
+  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
     symm_pack_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
   }
 };
 
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
-{
-  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
-  {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) {
     symm_pack_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
   }
 };
 
 /**
  * PanelMode
- * Packing might be called several times before being multiplied by gebp_kernel, this happens because 
+ * Packing might be called several times before being multiplied by gebp_kernel; this happens because
  * on special occasions it fills part of a block with other parts of the matrix. Two variables control
  * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever
  * the real offset and stride will be in the future, and this is what you should obey. The process
@@ -428,9 +362,8 @@
  * and offset and behaves accordingly.
  **/
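The bookkeeping this implies can be summarized in a short sketch (editor's illustration, not part of the patch): in PanelMode a vectorSize-wide panel reserves stride scalars per lane even though only depth of them are written by this call. panelAdvance below is a hypothetical helper; the same three terms appear in the dhs_pack/dhs_cpack implementations that follow.

// Net effect of packing one vectorSize-wide panel in PanelMode:
inline Index panelAdvance(Index ri, Index vectorSize, Index depth, Index stride, Index offset) {
  ri += vectorSize * offset;                     // skip rows packed by an earlier call
  ri += vectorSize * depth;                      // rows actually written by this call
  ri += vectorSize * (stride - offset - depth);  // reserve rows for a later call
  return ri;                                     // total advance: vectorSize * stride
}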
 
-template<typename Scalar, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,N>& block)
-{
+template <typename Scalar, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet, N>& block) {
   const Index size = 16 / sizeof(Scalar);
   pstore<Scalar>(to + (0 * size), block.packet[0]);
   pstore<Scalar>(to + (1 * size), block.packet[1]);
@@ -443,11 +376,12 @@
 }
 
 // General template for lhs & rhs complex packing.
-template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+template <typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate,
+          bool PanelMode, bool UseLhs>
 struct dhs_cpack {
-  template<bool transpose>
-  EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC,8>& cblock, PacketBlock<Packet,4>& block, Packet16uc permute)
-  {
+  template <bool transpose>
+  EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC, 8>& cblock, PacketBlock<Packet, 4>& block,
+                                      Packet16uc permute) {
     if (transpose) {
       block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, permute);
       block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, permute);
@@ -456,10 +390,14 @@
 
       Packet4f t0, t1, t2, t3;
 #ifdef EIGEN_VECTORIZE_VSX
-      t0 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
-      t1 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
-      t2 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
-      t3 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+      t0 = reinterpret_cast<Packet>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t1 = reinterpret_cast<Packet>(
+          vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t2 = reinterpret_cast<Packet>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+      t3 = reinterpret_cast<Packet>(
+          vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
 #else
       t0 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_HI));
       t1 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_LO));
@@ -479,21 +417,19 @@
     }
   }
 
-  EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
-  {
-    PacketBlock<Packet,4> blockr, blocki;
-    PacketBlock<PacketC,8> cblock;
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    PacketBlock<Packet, 4> blockr, blocki;
+    PacketBlock<PacketC, 8> cblock;
 
-    for(; i + vectorSize <= depth; i+=vectorSize)
-    {
+    for (; i + vectorSize <= depth; i += vectorSize) {
       if (UseLhs) {
         bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
       } else {
         bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
       }
 
-      if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
-      {
+      if (((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs))) {
         dhs_cblock<true>(cblock, blockr, p16uc_GETREAL32b);
         dhs_cblock<true>(cblock, blocki, p16uc_GETIMAG32b);
       } else {
@@ -501,8 +437,7 @@
         dhs_cblock<false>(cblock, blocki, p16uc_GETIMAG32);
       }
 
-      if(Conjugate)
-      {
+      if (Conjugate) {
         blocki.packet[0] = -blocki.packet[0];
         blocki.packet[1] = -blocki.packet[1];
         blocki.packet[2] = -blocki.packet[2];
@@ -512,21 +447,20 @@
       storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
       storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
 
-      rir += 4*vectorSize;
-      rii += 4*vectorSize;
+      rir += 4 * vectorSize;
+      rii += 4 * vectorSize;
     }
   }
 
-  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                      Index stride, Index offset) {
     const Index vectorSize = quad_traits<Scalar>::vectorsize;
     const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
-    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
-    Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
+    Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+    Scalar* blockAt = reinterpret_cast<Scalar*>(blockA);
     Index j = 0;
 
-    for(; j + vectorSize <= rows; j+=vectorSize)
-    {
+    for (; j + vectorSize <= rows; j += vectorSize) {
       const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
       Index i = 0;
 
@@ -534,13 +468,11 @@
 
       dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
 
-      for(; i < depth; i++)
-      {
-        PacketBlock<Packet,1> blockr, blocki;
-        PacketBlock<PacketC,2> cblock;
+      for (; i < depth; i++) {
+        PacketBlock<Packet, 1> blockr, blocki;
+        PacketBlock<PacketC, 2> cblock;
 
-        if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
-        {
+        if (((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs))) {
           if (UseLhs) {
             cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
             cblock.packet[1] = lhs2.template loadPacket<PacketC>(2, i);
@@ -561,8 +493,7 @@
         blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
         blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
 
-        if(Conjugate)
-        {
+        if (Conjugate) {
           blocki.packet[0] = -blocki.packet[0];
         }
 
@@ -573,50 +504,44 @@
         rii += vectorSize;
       }
 
-      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+      rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
     }
 
-    if (!UseLhs)
-    {
-      if(PanelMode) rir -= (offset*(vectorSize - 1));
+    if (!UseLhs) {
+      if (PanelMode) rir -= (offset * (vectorSize - 1));
 
-      for(; j < rows; j++)
-      {
+      for (; j < rows; j++) {
         const DataMapper lhs2 = lhs.getSubMapper(0, j);
         rii = rir + ((PanelMode) ? stride : depth);
 
-        for(Index i = 0; i < depth; i++)
-        {
+        for (Index i = 0; i < depth; i++) {
           blockAt[rir] = lhs2(i, 0).real();
 
-          if(Conjugate)
+          if (Conjugate)
             blockAt[rii] = -lhs2(i, 0).imag();
           else
-            blockAt[rii] =  lhs2(i, 0).imag();
+            blockAt[rii] = lhs2(i, 0).imag();
 
           rir += 1;
           rii += 1;
         }
 
-        rir += ((PanelMode) ? (2*stride - depth) : depth);
+        rir += ((PanelMode) ? (2 * stride - depth) : depth);
       }
     } else {
-      if (j < rows)
-      {
-        if(PanelMode) rir += (offset*(rows - j - vectorSize));
+      if (j < rows) {
+        if (PanelMode) rir += (offset * (rows - j - vectorSize));
         rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
 
-        for(Index i = 0; i < depth; i++)
-        {
+        for (Index i = 0; i < depth; i++) {
           Index k = j;
-          for(; k < rows; k++)
-          {
+          for (; k < rows; k++) {
             blockAt[rir] = lhs(k, i).real();
 
-            if(Conjugate)
+            if (Conjugate)
               blockAt[rii] = -lhs(k, i).imag();
             else
-              blockAt[rii] =  lhs(k, i).imag();
+              blockAt[rii] = lhs(k, i).imag();
 
             rir += 1;
             rii += 1;
@@ -628,68 +553,63 @@
 };
 
 // General template for lhs & rhs packing.
-template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
-struct dhs_pack{
-  template<Index n>
-  EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
-  {
-    PacketBlock<Packet,4> block[n];
+template <typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet, 4> block[n];
 
-    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
-    {
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
       for (Index k = 0; k < n; k++) {
         if (UseLhs) {
-          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k*vectorSize);
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k * vectorSize);
         } else {
-          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k*vectorSize, 0);
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k * vectorSize, 0);
         }
       }
 
-      if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
-      {
+      if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
         for (Index k = 0; k < n; k++) {
           ptranspose(block[k]);
         }
       }
 
       for (Index k = 0; k < n; k++) {
-        storeBlock<Scalar, Packet, 4>(blockA + ri + k*4*vectorSize, block[k]);
+        storeBlock<Scalar, Packet, 4>(blockA + ri + k * 4 * vectorSize, block[k]);
       }
 
-      ri += n*4*vectorSize;
+      ri += n * 4 * vectorSize;
     }
   }
 
-  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
     const Index vectorSize = quad_traits<Scalar>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + vectorSize <= rows; j+=vectorSize)
-    {
+    for (; j + vectorSize <= rows; j += vectorSize) {
       const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
       Index i = 0;
 
-      if(PanelMode) ri += vectorSize*offset;
+      if (PanelMode) ri += vectorSize * offset;
 
       dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
       dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
       dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
 
-      for(; i < depth; i++)
-      {
-        if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
-        {
+      for (; i < depth; i++) {
+        if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
           if (UseLhs) {
-            blockA[ri+0] = lhs2(0, i);
-            blockA[ri+1] = lhs2(1, i);
-            blockA[ri+2] = lhs2(2, i);
-            blockA[ri+3] = lhs2(3, i);
+            blockA[ri + 0] = lhs2(0, i);
+            blockA[ri + 1] = lhs2(1, i);
+            blockA[ri + 2] = lhs2(2, i);
+            blockA[ri + 3] = lhs2(3, i);
           } else {
-            blockA[ri+0] = lhs2(i, 0);
-            blockA[ri+1] = lhs2(i, 1);
-            blockA[ri+2] = lhs2(i, 2);
-            blockA[ri+3] = lhs2(i, 3);
+            blockA[ri + 0] = lhs2(i, 0);
+            blockA[ri + 1] = lhs2(i, 1);
+            blockA[ri + 2] = lhs2(i, 2);
+            blockA[ri + 3] = lhs2(i, 3);
           }
         } else {
           Packet lhsV;
@@ -704,34 +624,28 @@
         ri += vectorSize;
       }
 
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
     }
 
-    if (!UseLhs)
-    {
-      if(PanelMode) ri += offset;
+    if (!UseLhs) {
+      if (PanelMode) ri += offset;
 
-      for(; j < rows; j++)
-      {
+      for (; j < rows; j++) {
         const DataMapper lhs2 = lhs.getSubMapper(0, j);
-        for(Index i = 0; i < depth; i++)
-        {
+        for (Index i = 0; i < depth; i++) {
           blockA[ri] = lhs2(i, 0);
           ri += 1;
         }
 
-        if(PanelMode) ri += stride - depth;
+        if (PanelMode) ri += stride - depth;
       }
     } else {
-      if (j < rows)
-      {
-        if(PanelMode) ri += offset*(rows - j);
+      if (j < rows) {
+        if (PanelMode) ri += offset * (rows - j);
 
-        for(Index i = 0; i < depth; i++)
-        {
+        for (Index i = 0; i < depth; i++) {
           Index k = j;
-          for(; k < rows; k++)
-          {
+          for (; k < rows; k++) {
             blockA[ri] = lhs(k, i);
             ri += 1;
           }
@@ -742,64 +656,57 @@
 };
 
 // General template for lhs packing, float64 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
-{
-  template<Index n>
-  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
-  {
-    PacketBlock<Packet2d,2> block[n];
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true> {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet2d, 2> block[n];
 
-    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
-    {
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
       for (Index k = 0; k < n; k++) {
-        if(StorageOrder == RowMajor)
-        {
-          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize);
-          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k*vectorSize);
+        if (StorageOrder == RowMajor) {
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k * vectorSize);
         } else {
-          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 0);
-          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 1);
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 0);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 1);
         }
       }
 
-      if(StorageOrder == RowMajor)
-      {
+      if (StorageOrder == RowMajor) {
         for (Index k = 0; k < n; k++) {
           ptranspose(block[k]);
         }
       }
 
       for (Index k = 0; k < n; k++) {
-        storeBlock<double, Packet2d, 2>(blockA + ri + k*2*vectorSize, block[k]);
+        storeBlock<double, Packet2d, 2>(blockA + ri + k * 2 * vectorSize, block[k]);
       }
 
-      ri += n*2*vectorSize;
+      ri += n * 2 * vectorSize;
     }
   }
 
-  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
     const Index vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + vectorSize <= rows; j+=vectorSize)
-    {
+    for (; j + vectorSize <= rows; j += vectorSize) {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
-      if(PanelMode) ri += vectorSize*offset;
+      if (PanelMode) ri += vectorSize * offset;
 
       dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
       dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
       dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
 
-      for(; i < depth; i++)
-      {
-        if(StorageOrder == RowMajor)
-        {
-          blockA[ri+0] = lhs2(0, i);
-          blockA[ri+1] = lhs2(1, i);
+      for (; i < depth; i++) {
+        if (StorageOrder == RowMajor) {
+          blockA[ri + 0] = lhs2(0, i);
+          blockA[ri + 1] = lhs2(1, i);
         } else {
           Packet2d lhsV = lhs2.template loadPacket<Packet2d>(0, i);
           pstore<double>(blockA + ri, lhsV);
@@ -808,18 +715,15 @@
         ri += vectorSize;
       }
 
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
     }
 
-    if (j < rows)
-    {
-      if(PanelMode) ri += offset*(rows - j);
+    if (j < rows) {
+      if (PanelMode) ri += offset * (rows - j);
 
-      for(Index i = 0; i < depth; i++)
-      {
+      for (Index i = 0; i < depth; i++) {
         Index k = j;
-        for(; k < rows; k++)
-        {
+        for (; k < rows; k++) {
           blockA[ri] = lhs(k, i);
           ri += 1;
         }
@@ -829,34 +733,30 @@
 };
 
 // General template for rhs packing, float64 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
-{
-  template<Index n>
-  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
-  {
-    PacketBlock<Packet2d,2> block1[n], block2[n];
-    PacketBlock<Packet2d,4> block3[n];
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false> {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet2d, 2> block1[n], block2[n];
+    PacketBlock<Packet2d, 4> block3[n];
 
-    for(; i + n*vectorSize <= depth; i+=n*vectorSize)
-    {
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
       for (Index k = 0; k < n; k++) {
-        if(StorageOrder == ColMajor)
-        {
-          block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 0);
-          block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 1);
-          block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 2);
-          block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 3);
+        if (StorageOrder == ColMajor) {
+          block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 0);
+          block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 1);
+          block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 2);
+          block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 3);
         } else {
-          block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 0); //[a1 a2]
-          block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 2); //[a3 a4]
-          block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 0); //[b1 b2]
-          block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 2); //[b3 b4]
+          block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 0);  //[a1 a2]
+          block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 2);  //[a3 a4]
+          block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 0);  //[b1 b2]
+          block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 2);  //[b3 b4]
         }
       }
 
-      if(StorageOrder == ColMajor)
-      {
+      if (StorageOrder == ColMajor) {
         for (Index k = 0; k < n; k++) {
           ptranspose(block1[k]);
           ptranspose(block2[k]);
@@ -864,48 +764,44 @@
       }
 
       for (Index k = 0; k < n; k++) {
-        if(StorageOrder == ColMajor)
-        {
-          pstore<double>(blockB + ri + k*4*vectorSize    , block1[k].packet[0]);
-          pstore<double>(blockB + ri + k*4*vectorSize + 2, block2[k].packet[0]);
-          pstore<double>(blockB + ri + k*4*vectorSize + 4, block1[k].packet[1]);
-          pstore<double>(blockB + ri + k*4*vectorSize + 6, block2[k].packet[1]);
+        if (StorageOrder == ColMajor) {
+          pstore<double>(blockB + ri + k * 4 * vectorSize, block1[k].packet[0]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 2, block2[k].packet[0]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 4, block1[k].packet[1]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 6, block2[k].packet[1]);
         } else {
-          storeBlock<double, Packet2d, 4>(blockB + ri + k*4*vectorSize, block3[k]);
+          storeBlock<double, Packet2d, 4>(blockB + ri + k * 4 * vectorSize, block3[k]);
         }
       }
 
-      ri += n*4*vectorSize;
+      ri += n * 4 * vectorSize;
     }
   }
 
-  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+                                      Index offset) {
     const Index vectorSize = quad_traits<double>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
-    {
+    for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
       const DataMapper rhs2 = rhs.getSubMapper(0, j);
       Index i = 0;
 
-      if(PanelMode) ri += offset*(2*vectorSize);
+      if (PanelMode) ri += offset * (2 * vectorSize);
 
       dhs_copy<4>(blockB, rhs2, i, ri, depth, vectorSize);
       dhs_copy<2>(blockB, rhs2, i, ri, depth, vectorSize);
       dhs_copy<1>(blockB, rhs2, i, ri, depth, vectorSize);
 
-      for(; i < depth; i++)
-      {
-        if(StorageOrder == ColMajor)
-        {
-          blockB[ri+0] = rhs2(i, 0);
-          blockB[ri+1] = rhs2(i, 1);
+      for (; i < depth; i++) {
+        if (StorageOrder == ColMajor) {
+          blockB[ri + 0] = rhs2(i, 0);
+          blockB[ri + 1] = rhs2(i, 1);
 
           ri += vectorSize;
 
-          blockB[ri+0] = rhs2(i, 2);
-          blockB[ri+1] = rhs2(i, 3);
+          blockB[ri + 0] = rhs2(i, 2);
+          blockB[ri + 1] = rhs2(i, 3);
         } else {
           Packet2d rhsV = rhs2.template loadPacket<Packet2d>(i, 0);
           pstore<double>(blockB + ri, rhsV);
@@ -918,46 +814,40 @@
         ri += vectorSize;
       }
 
-      if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
+      if (PanelMode) ri += (2 * vectorSize) * (stride - offset - depth);
     }
 
-    if(PanelMode) ri += offset;
+    if (PanelMode) ri += offset;
 
-    for(; j < cols; j++)
-    {
+    for (; j < cols; j++) {
       const DataMapper rhs2 = rhs.getSubMapper(0, j);
-      for(Index i = 0; i < depth; i++)
-      {
+      for (Index i = 0; i < depth; i++) {
         blockB[ri] = rhs2(i, 0);
         ri += 1;
       }
 
-      if(PanelMode) ri += stride - depth;
+      if (PanelMode) ri += stride - depth;
     }
   }
 };
 
 // General template for lhs packing, bfloat16 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true>
-{
-  EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true> {
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
     const Index vectorSize = quad_traits<bfloat16>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + 2*vectorSize <= rows; j+=2*vectorSize)
-    {
+    for (; j + 2 * vectorSize <= rows; j += 2 * vectorSize) {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
-      if(PanelMode) ri += 2*vectorSize*offset;
+      if (PanelMode) ri += 2 * vectorSize * offset;
 
-      if(StorageOrder == ColMajor)
-      {
-        for(; i + 2 <= depth; i+=2)
-        {
-          PacketBlock<Packet8bf,4> block;
+      if (StorageOrder == ColMajor) {
+        for (; i + 2 <= depth; i += 2) {
+          PacketBlock<Packet8bf, 4> block;
 
           block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
           block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
@@ -965,8 +855,8 @@
           block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
 
           Packet8bf t0, t1;
-          t0              = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
-          t1              = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
+          t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
+          t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
           block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
           block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
           block.packet[0] = t0;
@@ -974,200 +864,237 @@
 
           storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
 
-          ri += 2*2*vectorSize;
+          ri += 2 * 2 * vectorSize;
         }
-        if (depth & 1)
-        {
-          PacketBlock<Packet8bf,2> block;
+        if (depth & 1) {
+          PacketBlock<Packet8bf, 2> block;
 
           block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
           block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
 
           storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
 
-          ri += 2*vectorSize;
+          ri += 2 * vectorSize;
         }
       } else {
-        for(; i + vectorSize <= depth; i+=vectorSize)
-        {
-          PacketBlock<Packet8bf,8> block1, block2;
+        for (; i + vectorSize <= depth; i += vectorSize) {
+          PacketBlock<Packet8bf, 8> block1, block2;
 
           bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
           bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
 
           Packet4ui v1[8], v2[8];
 
-          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
-          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
-          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
-          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
-          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
-          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
-          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
-          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
-          v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
-          v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
-          v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
-          v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
-          v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
-          v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
-          v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
-          v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+          v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+          v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+          v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+          v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+          v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+          v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+          v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
 
 #ifdef EIGEN_VECTORIZE_VSX
-          block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
-          block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
-          block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
-          block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
-          block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
-          block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
-          block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
-          block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
-          block2.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
-          block2.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
-          block2.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
-          block2.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
-          block2.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
-          block2.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
-          block2.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
-          block2.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
+          block1.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block1.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block2.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+          block2.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+          block2.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+          block2.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+          block2.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+          block2.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+          block2.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
+          block2.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
 #else
-          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
-          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
-          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
-          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
-          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
-          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
-          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
-          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
-          block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_HI));
-          block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_LO));
-          block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_HI));
-          block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_LO));
-          block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_HI));
-          block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_LO));
-          block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_HI));
-          block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_LO));
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
+          block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_HI));
+          block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_LO));
+          block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_HI));
+          block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_LO));
+          block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_HI));
+          block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_LO));
+          block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_HI));
+          block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_LO));
 #endif
 
-          for(Index M = 0; M < 8; M+=2) {
-            pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2*vectorSize * M), block1.packet[M+0]);
-            pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2*vectorSize * M), block1.packet[M+1]);
-            pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2*vectorSize * M), block2.packet[M+0]);
-            pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2*vectorSize * M), block2.packet[M+1]);
+          for (Index M = 0; M < 8; M += 2) {
+            pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 0]);
+            pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 1]);
+            pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 0]);
+            pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 1]);
           }
 
-          ri += 2*vectorSize*vectorSize;
+          ri += 2 * vectorSize * vectorSize;
         }
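// Editorial sketch (not part of the diff): the vec_mergeh/vec_mergel cascade
// above treats each Packet8bf row as four 32-bit lanes (bfloat16 pairs); the
// first pass interleaves adjacent rows at 32-bit granularity, and the second
// pass (64-bit merges under VSX, vec_perm with p16uc_TRANSPOSE64_HI/LO
// otherwise) completes the classic two-stage merge transpose so column data
// lands contiguously in blockA. A rough scalar equivalent for one 4x4 block
// of 32-bit lanes, with a hypothetical helper name:
#include <cstdint>
#include <utility>
static void transpose_lanes_sketch(uint32_t row[4][4]) {
  for (int r = 0; r < 4; r++)
    for (int c = r + 1; c < 4; c++) std::swap(row[r][c], row[c][r]);  // mirror across the diagonal
}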
-        for(; i + 2 <= depth; i+=2)
-        {
-          for(Index M = 0; M < 2*vectorSize; M++) {
+        for (; i + 2 <= depth; i += 2) {
+          for (Index M = 0; M < 2 * vectorSize; M++) {
             blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
             blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
           }
 
-          ri += 2*2*vectorSize;
+          ri += 2 * 2 * vectorSize;
         }
-        if (depth & 1)
-        {
-          for(Index M = 0; M < 2*vectorSize; M++) {
+        if (depth & 1) {
+          for (Index M = 0; M < 2 * vectorSize; M++) {
             blockA[ri + M] = lhs2(M, i);
           }
-          ri += 2*vectorSize;
+          ri += 2 * vectorSize;
         }
       }
 
-      if(PanelMode) ri += 2*vectorSize*(stride - offset - depth);
+      if (PanelMode) ri += 2 * vectorSize * (stride - offset - depth);
     }
-    for(; j + vectorSize <= rows; j+=vectorSize)
-    {
+    for (; j + vectorSize <= rows; j += vectorSize) {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
-      if(PanelMode) ri += vectorSize*offset;
+      if (PanelMode) ri += vectorSize * offset;
 
-      if(StorageOrder == ColMajor)
-      {
-        for(; i + 2 <= depth; i+=2)
-        {
-          PacketBlock<Packet8bf,2> block;
+      if (StorageOrder == ColMajor) {
+        for (; i + 2 <= depth; i += 2) {
+          PacketBlock<Packet8bf, 2> block;
 
           block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
           block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
 
           Packet8bf t0;
-          t0              = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+          t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
           block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
           block.packet[0] = t0;
 
           storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
 
-          ri += 2*vectorSize;
+          ri += 2 * vectorSize;
         }
-        if (depth & 1)
-        {
+        if (depth & 1) {
           Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
           pstore<bfloat16>(blockA + ri, lhsV);
 
           ri += vectorSize;
         }
       } else {
-        for(; i + vectorSize <= depth; i+=vectorSize)
-        {
-          PacketBlock<Packet8bf,8> block1;
+        for (; i + vectorSize <= depth; i += vectorSize) {
+          PacketBlock<Packet8bf, 8> block1;
 
           bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
 
           Packet4ui v1[8];
 
           // This is transposing and interleaving data
-          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
-          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
-          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
-          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
-          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
-          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
-          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
-          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
 
 #ifdef EIGEN_VECTORIZE_VSX
-          block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
-          block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
-          block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
-          block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
-          block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
-          block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
-          block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
-          block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
+          block1.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block1.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
 #else
-          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
-          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
-          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
-          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
-          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
-          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
-          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
-          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
 #endif
 
-          for(Index M = 0; M < 8; M++) {
+          for (Index M = 0; M < 8; M++) {
             pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
           }
 
-          ri += vectorSize*vectorSize;
+          ri += vectorSize * vectorSize;
         }
-        for(; i + 2 <= depth; i+=2)
-        {
-          for(Index M = 0; M < vectorSize; M++) {
+        for (; i + 2 <= depth; i += 2) {
+          for (Index M = 0; M < vectorSize; M++) {
             blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
             blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
           }
 
-          ri += 2*vectorSize;
+          ri += 2 * vectorSize;
         }
-        if (depth & 1)
-        {
-          for(Index M = 0; M < vectorSize; M++) {
+        if (depth & 1) {
+          for (Index M = 0; M < vectorSize; M++) {
             blockA[ri + M] = lhs2(M, i);
           }
 
@@ -1175,20 +1102,17 @@
         }
       }
 
-      if(PanelMode) ri += vectorSize*(stride - offset - depth);
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
     }
-    if(j + 4 <= rows)
-    {
+    if (j + 4 <= rows) {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
-      if(PanelMode) ri += 4*offset;
+      if (PanelMode) ri += 4 * offset;
 
-      for(; i + 2 <= depth; i+=2)
-      {
-        if(StorageOrder == ColMajor)
-        {
-          PacketBlock<Packet8bf,2> block;
+      for (; i + 2 <= depth; i += 2) {
+        if (StorageOrder == ColMajor) {
+          PacketBlock<Packet8bf, 2> block;
 
           block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
           block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
@@ -1197,58 +1121,51 @@
 
           pstore<bfloat16>(blockA + ri, block.packet[0]);
         } else {
-          blockA[ri+0] = lhs2(0, i + 0);
-          blockA[ri+1] = lhs2(0, i + 1);
-          blockA[ri+2] = lhs2(1, i + 0);
-          blockA[ri+3] = lhs2(1, i + 1);
-          blockA[ri+4] = lhs2(2, i + 0);
-          blockA[ri+5] = lhs2(2, i + 1);
-          blockA[ri+6] = lhs2(3, i + 0);
-          blockA[ri+7] = lhs2(3, i + 1);
+          blockA[ri + 0] = lhs2(0, i + 0);
+          blockA[ri + 1] = lhs2(0, i + 1);
+          blockA[ri + 2] = lhs2(1, i + 0);
+          blockA[ri + 3] = lhs2(1, i + 1);
+          blockA[ri + 4] = lhs2(2, i + 0);
+          blockA[ri + 5] = lhs2(2, i + 1);
+          blockA[ri + 6] = lhs2(3, i + 0);
+          blockA[ri + 7] = lhs2(3, i + 1);
         }
 
-        ri += 2*4;
+        ri += 2 * 4;
       }
-      if (depth & 1)
-      {
-        if(StorageOrder == ColMajor)
-        {
+      if (depth & 1) {
+        if (StorageOrder == ColMajor) {
           Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
 
           pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
         } else {
-          blockA[ri+0] = lhs2(0, i);
-          blockA[ri+1] = lhs2(1, i);
-          blockA[ri+2] = lhs2(2, i);
-          blockA[ri+3] = lhs2(3, i);
+          blockA[ri + 0] = lhs2(0, i);
+          blockA[ri + 1] = lhs2(1, i);
+          blockA[ri + 2] = lhs2(2, i);
+          blockA[ri + 3] = lhs2(3, i);
         }
 
         ri += 4;
       }
 
-      if(PanelMode) ri += 4*(stride - offset - depth);
+      if (PanelMode) ri += 4 * (stride - offset - depth);
       j += 4;
     }
 
-    if (j < rows)
-    {
-      if(PanelMode) ri += offset*(rows - j);
+    if (j < rows) {
+      if (PanelMode) ri += offset * (rows - j);
 
       Index i = 0;
-      for(; i + 2 <= depth; i+=2)
-      {
+      for (; i + 2 <= depth; i += 2) {
         Index k = j;
-        for(; k < rows; k++)
-        {
-          blockA[ri+0] = lhs(k, i + 0);
-          blockA[ri+1] = lhs(k, i + 1);
+        for (; k < rows; k++) {
+          blockA[ri + 0] = lhs(k, i + 0);
+          blockA[ri + 1] = lhs(k, i + 1);
           ri += 2;
         }
       }
-      if (depth & 1)
-      {
-        for(; j < rows; j++)
-        {
+      if (depth & 1) {
+        for (; j < rows; j++) {
           blockA[ri] = lhs(j, i);
           ri += 1;
         }
@@ -1258,51 +1175,55 @@
 };
 
 // General template for rhs packing, bfloat16 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false>
-{
-  EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-  {
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false> {
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+                                      Index offset) {
     const Index vectorSize = quad_traits<bfloat16>::vectorsize;
     Index ri = 0, j = 0;
 
-    for(; j + 4 <= cols; j+=4)
-    {
+    for (; j + 4 <= cols; j += 4) {
       const DataMapper rhs2 = rhs.getSubMapper(0, j);
       Index i = 0;
 
-      if(PanelMode) ri += 4*offset;
+      if (PanelMode) ri += 4 * offset;
 
-      for(; i + vectorSize <= depth; i+=vectorSize)
-      {
-        if(StorageOrder == ColMajor)
-        {
-          PacketBlock<Packet8bf,4> block;
+      for (; i + vectorSize <= depth; i += vectorSize) {
+        if (StorageOrder == ColMajor) {
+          PacketBlock<Packet8bf, 4> block;
 
           bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
 
           Packet4ui t0, t1, t2, t3;
 
-          t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
-          t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
-          t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
-          t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+          t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+          t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+          t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+          t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[3].m_val));
 
 #ifdef EIGEN_VECTORIZE_VSX
-          block.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
-          block.packet[1] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
-          block.packet[2] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
-          block.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
+          block.packet[0] =
+              reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+          block.packet[1] =
+              reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+          block.packet[2] =
+              reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
+          block.packet[3] =
+              reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
 #else
-          block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_HI));
-          block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_LO));
-          block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_HI));
-          block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_LO));
+          block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_HI));
+          block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_LO));
+          block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_HI));
+          block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_LO));
 #endif
 
           storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
         } else {
-          PacketBlock<Packet8bf,8> block;
+          PacketBlock<Packet8bf, 8> block;
 
           for (int M = 0; M < 8; M++) {
             block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
@@ -1320,21 +1241,20 @@
           }
         }
 
-        ri += 4*vectorSize;
+        ri += 4 * vectorSize;
       }
       for (; i + 2 <= depth; i += 2) {
-        if(StorageOrder == ColMajor)
-        {
-          blockB[ri+0] = rhs2(i + 0, 0);
-          blockB[ri+1] = rhs2(i + 1, 0);
-          blockB[ri+2] = rhs2(i + 0, 1);
-          blockB[ri+3] = rhs2(i + 1, 1);
-          blockB[ri+4] = rhs2(i + 0, 2);
-          blockB[ri+5] = rhs2(i + 1, 2);
-          blockB[ri+6] = rhs2(i + 0, 3);
-          blockB[ri+7] = rhs2(i + 1, 3);
+        if (StorageOrder == ColMajor) {
+          blockB[ri + 0] = rhs2(i + 0, 0);
+          blockB[ri + 1] = rhs2(i + 1, 0);
+          blockB[ri + 2] = rhs2(i + 0, 1);
+          blockB[ri + 3] = rhs2(i + 1, 1);
+          blockB[ri + 4] = rhs2(i + 0, 2);
+          blockB[ri + 5] = rhs2(i + 1, 2);
+          blockB[ri + 6] = rhs2(i + 0, 3);
+          blockB[ri + 7] = rhs2(i + 1, 3);
         } else {
-          PacketBlock<Packet8bf,2> block;
+          PacketBlock<Packet8bf, 2> block;
 
           for (int M = 0; M < 2; M++) {
             block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
@@ -1345,40 +1265,34 @@
           pstore<bfloat16>(blockB + ri, block.packet[0]);
         }
 
-        ri += 4*2;
+        ri += 4 * 2;
       }
-      if (depth & 1)
-      {
-        blockB[ri+0] = rhs2(i, 0);
-        blockB[ri+1] = rhs2(i, 1);
-        blockB[ri+2] = rhs2(i, 2);
-        blockB[ri+3] = rhs2(i, 3);
+      if (depth & 1) {
+        blockB[ri + 0] = rhs2(i, 0);
+        blockB[ri + 1] = rhs2(i, 1);
+        blockB[ri + 2] = rhs2(i, 2);
+        blockB[ri + 3] = rhs2(i, 3);
 
         ri += 4;
       }
 
-      if(PanelMode) ri += 4*(stride - offset - depth);
+      if (PanelMode) ri += 4 * (stride - offset - depth);
     }
 
-    if (j < cols)
-    {
-      if(PanelMode) ri += offset*(cols - j);
+    if (j < cols) {
+      if (PanelMode) ri += offset * (cols - j);
 
       Index i = 0;
-      for(; i + 2 <= depth; i+=2)
-      {
+      for (; i + 2 <= depth; i += 2) {
         Index k = j;
-        for(; k < cols; k++)
-        {
-          blockB[ri+0] = rhs(i + 0, k);
-          blockB[ri+1] = rhs(i + 1, k);
+        for (; k < cols; k++) {
+          blockB[ri + 0] = rhs(i + 0, k);
+          blockB[ri + 1] = rhs(i + 1, k);
           ri += 2;
         }
       }
-      if (depth & 1)
-      {
-        for(; j < cols; j++)
-        {
+      if (depth & 1) {
+        for (; j < cols; j++) {
           blockB[ri] = rhs(i, j);
           ri += 1;
         }
@@ -1388,45 +1302,41 @@
 };
 
 // General template for lhs complex packing, float64 specialization.
-template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
-{
-  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
-  {
-    PacketBlock<Packet,2> blockr, blocki;
-    PacketBlock<PacketC,4> cblock;
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true> {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    PacketBlock<Packet, 2> blockr, blocki;
+    PacketBlock<PacketC, 4> cblock;
 
-    for(; i + vectorSize <= depth; i+=vectorSize)
-    {
-      if(StorageOrder == ColMajor)
-      {
-        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
-        cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+    for (; i + vectorSize <= depth; i += vectorSize) {
+      if (StorageOrder == ColMajor) {
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0);  //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1);  //[b1 b1i]
 
-        cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
-        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0);  //[a2 a2i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1);  //[b2 b2i]
 
-        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
-        blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v);  //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v);  //[b1 b2]
 
         blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
         blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
       } else {
-        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
-        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);  //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);  //[a2 a2i]
 
-        cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
-        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1);  //[b1 b1i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1);  //[b2 b2i]
 
-        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
-        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);  //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);  //[b1 b2]
 
         blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
         blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
       }
 
-      if(Conjugate)
-      {
+      if (Conjugate) {
         blocki.packet[0] = -blocki.packet[0];
         blocki.packet[1] = -blocki.packet[1];
       }
@@ -1434,21 +1344,20 @@
       storeBlock<double, Packet, 2>(blockAt + rir, blockr);
       storeBlock<double, Packet, 2>(blockAt + rii, blocki);
 
-      rir += 2*vectorSize;
-      rii += 2*vectorSize;
+      rir += 2 * vectorSize;
+      rii += 2 * vectorSize;
     }
   }
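// Editorial sketch (hypothetical helper, not part of the diff): dhs_ccopy
// above splits interleaved complex storage into decoupled real and imaginary
// panels -- vec_mergeh gathers the real halves of two complex values,
// vec_mergel the imaginary halves, and Conjugate negates the imaginary panel.
#include <complex>
static void decouple_sketch(const std::complex<double> in[2], double re[2], double im[2], bool conjugate) {
  for (int k = 0; k < 2; k++) {
    re[k] = in[k].real();                              // real panel, written at rir
    im[k] = conjugate ? -in[k].imag() : in[k].imag();  // imaginary panel, written at rii
  }
}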
 
-  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                      Index stride, Index offset) {
     const Index vectorSize = quad_traits<double>::vectorsize;
     const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
-    Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
-    double* blockAt = reinterpret_cast<double *>(blockA);
+    Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+    double* blockAt = reinterpret_cast<double*>(blockA);
     Index j = 0;
 
-    for(; j + vectorSize <= rows; j+=vectorSize)
-    {
+    for (; j + vectorSize <= rows; j += vectorSize) {
       const DataMapper lhs2 = lhs.getSubMapper(j, 0);
       Index i = 0;
 
@@ -1456,10 +1365,9 @@
 
       dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
 
-      for(; i < depth; i++)
-      {
-        PacketBlock<Packet,1> blockr, blocki;
-        PacketBlock<PacketC,2> cblock;
+      for (; i < depth; i++) {
+        PacketBlock<Packet, 1> blockr, blocki;
+        PacketBlock<PacketC, 2> cblock;
 
         cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
         cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);
@@ -1467,8 +1375,7 @@
         blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
         blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
 
-        if(Conjugate)
-        {
+        if (Conjugate) {
           blocki.packet[0] = -blocki.packet[0];
         }
 
@@ -1479,25 +1386,22 @@
         rii += vectorSize;
       }
 
-      rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+      rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
     }
 
-    if (j < rows)
-    {
-      if(PanelMode) rir += (offset*(rows - j - vectorSize));
+    if (j < rows) {
+      if (PanelMode) rir += (offset * (rows - j - vectorSize));
       rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
 
-      for(Index i = 0; i < depth; i++)
-      {
+      for (Index i = 0; i < depth; i++) {
         Index k = j;
-        for(; k < rows; k++)
-        {
+        for (; k < rows; k++) {
           blockAt[rir] = lhs(k, i).real();
 
-          if(Conjugate)
+          if (Conjugate)
             blockAt[rii] = -lhs(k, i).imag();
           else
-            blockAt[rii] =  lhs(k, i).imag();
+            blockAt[rii] = lhs(k, i).imag();
 
           rir += 1;
           rii += 1;
@@ -1508,15 +1412,13 @@
 };
 
 // General template for rhs complex packing, float64 specialization.
-template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
-{
-  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
-  {
-    for(; i < depth; i++)
-    {
-      PacketBlock<PacketC,4> cblock;
-      PacketBlock<Packet,2> blockr, blocki;
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false> {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    for (; i < depth; i++) {
+      PacketBlock<PacketC, 4> cblock;
+      PacketBlock<Packet, 2> blockr, blocki;
 
       bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
 
@@ -1526,8 +1428,7 @@
       blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
       blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
 
-      if(Conjugate)
-      {
+      if (Conjugate) {
         blocki.packet[0] = -blocki.packet[0];
         blocki.packet[1] = -blocki.packet[1];
       }
@@ -1535,21 +1436,20 @@
       storeBlock<double, Packet, 2>(blockBt + rir, blockr);
       storeBlock<double, Packet, 2>(blockBt + rii, blocki);
 
-      rir += 2*vectorSize;
-      rii += 2*vectorSize;
+      rir += 2 * vectorSize;
+      rii += 2 * vectorSize;
     }
   }
 
-  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-  {
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols,
+                                      Index stride, Index offset) {
     const Index vectorSize = quad_traits<double>::vectorsize;
-    const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
-    Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
-    double* blockBt = reinterpret_cast<double *>(blockB);
+    const Index vectorDelta = 2 * vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (2 * vectorSize * offset) : 0), rii;
+    double* blockBt = reinterpret_cast<double*>(blockB);
     Index j = 0;
 
-    for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
-    {
+    for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
       const DataMapper rhs2 = rhs.getSubMapper(0, j);
       Index i = 0;
 
@@ -1557,30 +1457,28 @@
 
       dhs_ccopy(blockBt, rhs2, i, rir, rii, depth, vectorSize);
 
-      rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
+      rir += ((PanelMode) ? (2 * vectorSize * (2 * stride - depth)) : vectorDelta);
     }
 
-    if(PanelMode) rir -= (offset*(2*vectorSize - 1));
+    if (PanelMode) rir -= (offset * (2 * vectorSize - 1));
 
-    for(; j < cols; j++)
-    {
+    for (; j < cols; j++) {
       const DataMapper rhs2 = rhs.getSubMapper(0, j);
       rii = rir + ((PanelMode) ? stride : depth);
 
-      for(Index i = 0; i < depth; i++)
-      {
+      for (Index i = 0; i < depth; i++) {
         blockBt[rir] = rhs2(i, 0).real();
 
-        if(Conjugate)
+        if (Conjugate)
           blockBt[rii] = -rhs2(i, 0).imag();
         else
-          blockBt[rii] =  rhs2(i, 0).imag();
+          blockBt[rii] = rhs2(i, 0).imag();
 
         rir += 1;
         rii += 1;
       }
 
-      rir += ((PanelMode) ? (2*stride - depth) : depth);
+      rir += ((PanelMode) ? (2 * stride - depth) : depth);
     }
   }
 };
@@ -1590,11 +1488,9 @@
  **************/
 
 // 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex gemm).
-template<typename Packet, bool NegativeAccumulate, int N>
-EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,N>* acc, const Packet& lhsV, const Packet* rhsV)
-{
-  if(NegativeAccumulate)
-  {
+template <typename Packet, bool NegativeAccumulate, int N>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet, N>* acc, const Packet& lhsV, const Packet* rhsV) {
+  if (NegativeAccumulate) {
     for (int M = 0; M < N; M++) {
       acc->packet[M] = vec_nmsub(lhsV, rhsV[M], acc->packet[M]);
     }
@@ -1605,21 +1501,20 @@
   }
 }
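// Editorial sketch: per 32-bit lane the rank-1 update above is a fused
// multiply-add; vec_nmsub(a, b, c) computes c - a*b, so NegativeAccumulate
// subtracts lhsV * rhsV[M], while the branch elided by the hunk presumably
// accumulates positively. Scalar form, assuming 4-lane float packets:
template <int N>
static void pger_sketch(float acc[][4], const float lhs[4], const float rhs[][4], bool negate) {
  for (int M = 0; M < N; M++)
    for (int l = 0; l < 4; l++) acc[M][l] += (negate ? -lhs[l] : lhs[l]) * rhs[M][l];
}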
 
-template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
-{
+template <int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet, N>* acc, const Scalar* lhs, const Packet* rhsV) {
   Packet lhsV = pload<Packet>(lhs);
 
   pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
 }
 
-// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real.
-template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
-{
+// 512-bit rank-1 update of complex acc. It takes decoupled accumulators as entries. It also takes care of
+// mixed types real * complex and complex * real.
+template <int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag,
+                                      const Packet& lhsV, Packet& lhsVi, const Packet* rhsV, const Packet* rhsVi) {
   pger_common<Packet, false, N>(accReal, lhsV, rhsV);
-  if(LhsIsReal)
-  {
+  if (LhsIsReal) {
     pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
     EIGEN_UNUSED_VARIABLE(lhsVi);
   } else {
@@ -1633,52 +1528,52 @@
   }
 }
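// Editorial note: with decoupled accumulators the complex rank-1 update above
// follows (lr + i*li)(rr + i*ri) = (lr*rr - li*ri) + i*(lr*ri + li*rr); the
// like products feed accReal, the cross products feed accImag, and the
// Conjugate* flags flip the sign of li or ri by reusing the
// negative-accumulate form of pger_common instead of negating explicitly.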
 
-template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
-{
+template <int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag, const Scalar* lhs_ptr,
+                               const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) {
   Packet lhsV = ploadLhs<Packet>(lhs_ptr);
   Packet lhsVi;
-  if(!LhsIsReal) lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
-  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  if (!LhsIsReal)
+    lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
+  else
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
 
   pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
 }
 
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs) {
   return ploadu<Packet>(lhs);
 }
 
 // Zero the accumulator on PacketBlock.
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,N>& acc)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet, N>& acc) {
   for (int M = 0; M < N; M++) {
     acc.packet[M] = pset1<Packet>((__UNPACK_TYPE__(Packet))0);
   }
 }
 
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ,
+                                        const Packet& pAlpha) {
   for (int M = 0; M < N; M++) {
     acc.packet[M] = vec_mul(accZ.packet[M], pAlpha);
   }
 }
 
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,N>& acc, const Packet& pMask)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet, N>& acc, const Packet& pMask) {
   for (int M = 0; M < N; M++) {
     acc.packet[M] = pand<Packet>(acc.packet[M], pMask);
   }
 }
 
 // Complex version of PacketBlock scaling.
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask)
-{
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+                                 const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+                                 const Packet& pMask) {
   if (mask && (sizeof(__UNPACK_TYPE__(Packet)) == sizeof(float))) {
     band<Packet, N>(aReal, pMask);
     band<Packet, N>(aImag, pMask);
@@ -1698,16 +1593,16 @@
 // Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
 //
 // full = operate (load) on the entire PacketBlock or only half
-template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
-{
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                               Index col) {
   if (StorageOrder == RowMajor) {
     for (int M = 0; M < N; M++) {
       acc.packet[M] = res.template loadPacket<Packet>(row + M, col);
     }
     if (Complex) {
       for (int M = 0; M < N; M++) {
-        acc.packet[M+N] = res.template loadPacket<Packet>(row + M, col + accCols);
+        acc.packet[M + N] = res.template loadPacket<Packet>(row + M, col + accCols);
       }
     }
   } else {
@@ -1716,37 +1611,35 @@
     }
     if (Complex && full) {
       for (int M = 0; M < N; M++) {
-        acc.packet[M+N] = res.template loadPacket<Packet>(row + accCols, col + M);
+        acc.packet[M + N] = res.template loadPacket<Packet>(row + accCols, col + M);
       }
     }
   }
 }
 
-template<typename DataMapper, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row)
-{
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row) {
   for (int M = 0; M < N; M++) {
     res.template storePacket<Packet>(row, M, acc.packet[M]);
   }
 }
 
 #ifdef USE_PARTIAL_PACKETS
-template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
-EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements)
-{
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                                       Index elements) {
   for (Index M = 0; M < N; M++) {
     acc.packet[M] = res.template loadPacketPartial<Packet>(row, M, elements);
   }
   if (Complex && full) {
     for (Index M = 0; M < N; M++) {
-      acc.packet[M+N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
+      acc.packet[M + N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
     }
   }
 }
 
-template<typename DataMapper, typename Packet, Index N>
-EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements)
-{
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements) {
   for (Index M = 0; M < N; M++) {
     res.template storePacketPartial<Packet>(row, M, acc.packet[M], elements);
   }
@@ -1760,12 +1653,11 @@
 #endif
 
 #if !USE_P10_AND_PVIPR2_0
-const static Packet4i mask4[4] = { {  0,  0,  0,  0 }, { -1,  0,  0,  0 }, { -1, -1,  0,  0 }, { -1, -1, -1,  0 } };
+const static Packet4i mask4[4] = {{0, 0, 0, 0}, {-1, 0, 0, 0}, {-1, -1, 0, 0}, {-1, -1, -1, 0}};
 #endif
 
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) {
 #if USE_P10_AND_PVIPR2_0
 #ifdef _BIG_ENDIAN
   return Packet(vec_reve(vec_genwm((1 << remaining_rows) - 1)));
@@ -1777,9 +1669,8 @@
 #endif
 }
 
-template<>
-EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows) {
 #if USE_P10_AND_PVIPR2_0
   Packet2d mask2 = Packet2d(vec_gendm(remaining_rows));
 #ifdef _BIG_ENDIAN
@@ -1788,23 +1679,22 @@
   return mask2;
 #endif
 #else
-  Packet2l ret = { -remaining_rows, 0 };
+  Packet2l ret = {-remaining_rows, 0};
   return Packet2d(ret);
 #endif
 }
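// Editorial sketch: both bmask overloads build a per-lane select mask for the
// last partial micro-panel; lane l is all-ones when l < remaining_rows and
// zero otherwise, matching the mask4 table used on pre-Power10 targets.
// Scalar form, assuming 32-bit lanes:
#include <cstdint>
static void bmask_sketch(int32_t out[4], int remaining_rows) {
  for (int l = 0; l < 4; l++) out[l] = (l < remaining_rows) ? -1 : 0;  // -1 == all bits set
}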
 
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha) {
   for (int M = 0; M < N; M++) {
     acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
   }
 }
 
 // Scale the PacketBlock vectors by alpha.
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
-{
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+                                const Packet& pMask) {
   if (mask) {
     band<Packet, N>(accZ, pMask);
   } else {
@@ -1814,11 +1704,10 @@
   bscale<Packet, N>(acc, accZ, pAlpha);
 }
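// Editorial sketch: the masked bscale above first zeroes the lanes past the
// live rows (band ANDs accZ with pMask) and then folds the accumulator into
// the destination with a multiply-add, acc += pAlpha * accZ. Scalar form,
// assuming 4-lane float packets (<cstdint> assumed for int32_t):
static void bscale_sketch(float acc[4], const float accZ[4], float alpha, const int32_t mask[4]) {
  for (int l = 0; l < 4; l++) acc[l] += alpha * (mask[l] ? accZ[l] : 0.0f);
}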
 
-template<typename Packet, int N, bool real>
-EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) *ap0,
-        const __UNPACK_TYPE__(Packet) *ap1, const __UNPACK_TYPE__(Packet) *ap2,
-        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
-{
+template <typename Packet, int N, bool real>
+EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) * ap0, const __UNPACK_TYPE__(Packet) * ap1,
+                                     const __UNPACK_TYPE__(Packet) * ap2, Packet& a0, Packet& a1, Packet& a2,
+                                     Packet& a3) {
   a0 = pset1<Packet>(ap0[0]);
   if (N == 4) {
     a1 = pset1<Packet>(ap0[1]);
@@ -1842,24 +1731,21 @@
   }
 }
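// Editorial sketch: pbroadcastN splats up to four rhs scalars, each across its
// own packet, so a single rhs column step can feed several rank-1 updates; the
// specializations below only pick a cheaper splat sequence per packet type.
// Scalar form, assuming 4-lane packets and N == 4:
static void pbroadcast_sketch(const float* ap0, float a[4][4]) {
  for (int k = 0; k < 4; k++)
    for (int l = 0; l < 4; l++) a[k][l] = ap0[k];
}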
 
-template<> EIGEN_ALWAYS_INLINE void
-pbroadcastN<Packet4f,4,true>(const float *ap0, const float *, const float *,
-                             Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, true>(const float* ap0, const float*, const float*, Packet4f& a0,
+                                                        Packet4f& a1, Packet4f& a2, Packet4f& a3) {
   pbroadcast4<Packet4f>(ap0, a0, a1, a2, a3);
 }
 
-template<> EIGEN_ALWAYS_INLINE void
-pbroadcastN<Packet4f,4,false>(const float *ap0, const float *ap1, const float *ap2,
-                              Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
-  pbroadcastN<Packet4f,4,true>(ap0, ap1, ap2, a0, a1, a2, a3);
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, false>(const float* ap0, const float* ap1, const float* ap2,
+                                                         Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  pbroadcastN<Packet4f, 4, true>(ap0, ap1, ap2, a0, a1, a2, a3);
 }
 
-template<>
-EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d,4,false>(const double* ap0, const double *,
-    const double *, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d, 4, false>(const double* ap0, const double*, const double*, Packet2d& a0,
+                                                         Packet2d& a1, Packet2d& a2, Packet2d& a3) {
   a1 = pload<Packet2d>(ap0);
   a3 = pload<Packet2d>(ap0 + 2);
   a0 = vec_splat(a1, 0);
@@ -1869,9 +1755,9 @@
 }
 
 // Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
-{
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                        PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2) {
   for (int M = 0; M < N; M++) {
     acc1.packet[M].v = vec_mergeh(taccReal.packet[M], taccImag.packet[M]);
   }
@@ -1883,9 +1769,10 @@
   }
 }
 
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
-{
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                 PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+                                 PacketBlock<Packetc, N>& acc2) {
   bcouple_common<Packet, Packetc, N, full>(taccReal, taccImag, acc1, acc2);
 
   for (int M = 0; M < N; M++) {
@@ -1894,7 +1781,7 @@
 
   if (full) {
     for (int M = 0; M < N; M++) {
-      acc2.packet[M] = padd<Packetc>(tRes.packet[M+N], acc2.packet[M]);
+      acc2.packet[M] = padd<Packetc>(tRes.packet[M + N], acc2.packet[M]);
     }
   }
 }
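
// A scalar sketch of the coupling done by bcouple_common above, assuming
// 4-lane float packets (illustration only): vec_mergeh interleaves the low
// lanes and vec_mergel the high lanes, turning the split accumulators
// [r0 r1 r2 r3] / [i0 i1 i2 i3] into complex pairs (r, i).
#include <array>
static std::array<float, 4> mergeh_model(const std::array<float, 4>& re,
                                         const std::array<float, 4>& im) {
  return {re[0], im[0], re[1], im[1]};  // low halves  -> (r0,i0),(r1,i1)
}
static std::array<float, 4> mergel_model(const std::array<float, 4>& re,
                                         const std::array<float, 4>& im) {
  return {re[2], im[2], re[3], im[3]};  // high halves -> (r2,i2),(r3,i3)
}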
@@ -1903,143 +1790,132 @@
 #define PEEL 7
 #define PEEL_ROW 7
 
-#define MICRO_UNROLL(func) \
-  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+#define MICRO_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 
-#define MICRO_NORMAL_ROWS \
-  accRows == quad_traits<Scalar>::rows || accRows == 1
+#define MICRO_NORMAL_ROWS accRows == quad_traits<Scalar>::rows || accRows == 1
 
 #define MICRO_NEW_ROWS ((MICRO_NORMAL_ROWS) ? accRows : 1)
 
 #define MICRO_RHS(ptr, N) rhs_##ptr##N
 
-#define MICRO_ZERO_PEEL(peel) \
-  if ((PEEL_ROW > peel) && (peel != 0)) { \
+#define MICRO_ZERO_PEEL(peel)                 \
+  if ((PEEL_ROW > peel) && (peel != 0)) {     \
     bsetzero<Packet, accRows>(accZero##peel); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accZero##peel); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accZero##peel);     \
   }
 
-#define MICRO_ADD(ptr, N) \
-  if (MICRO_NORMAL_ROWS) { \
-    MICRO_RHS(ptr,0) += (accRows * N); \
-  } else { \
-    MICRO_RHS(ptr,0) += N; \
-    MICRO_RHS(ptr,1) += N; \
-    if (accRows == 3) { \
-       MICRO_RHS(ptr,2) += N; \
-    } \
+#define MICRO_ADD(ptr, N)               \
+  if (MICRO_NORMAL_ROWS) {              \
+    MICRO_RHS(ptr, 0) += (accRows * N); \
+  } else {                              \
+    MICRO_RHS(ptr, 0) += N;             \
+    MICRO_RHS(ptr, 1) += N;             \
+    if (accRows == 3) {                 \
+      MICRO_RHS(ptr, 2) += N;           \
+    }                                   \
   }
 
 #define MICRO_ADD_ROWS(N) MICRO_ADD(ptr, N)
 
-#define MICRO_BROADCAST1(peel, ptr, rhsV, real) \
-  if (MICRO_NORMAL_ROWS) { \
-    pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + (accRows * peel), MICRO_RHS(ptr,0), MICRO_RHS(ptr,0), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
-  } else { \
-    pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + peel, MICRO_RHS(ptr,1) + peel, MICRO_RHS(ptr,2) + peel, rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+#define MICRO_BROADCAST1(peel, ptr, rhsV, real)                                                                      \
+  if (MICRO_NORMAL_ROWS) {                                                                                           \
+    pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + (accRows * peel), MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 0),   \
+                                       rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]);                  \
+  } else {                                                                                                           \
+    pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + peel, MICRO_RHS(ptr, 1) + peel, MICRO_RHS(ptr, 2) + peel, \
+                                       rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]);                  \
   }
 
 #define MICRO_BROADCAST(peel) MICRO_BROADCAST1(peel, ptr, rhsV, true)
 
-#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real) \
-  pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0), MICRO_RHS(ptr,1), MICRO_RHS(ptr,2), rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real)                                                                 \
+  pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 1), MICRO_RHS(ptr, 2), rhsV[0], rhsV[1], \
+                                     rhsV[2], rhsV[3]);
 
-#define MICRO_BROADCAST_EXTRA \
-  Packet rhsV[4]; \
+#define MICRO_BROADCAST_EXTRA             \
+  Packet rhsV[4];                         \
   MICRO_BROADCAST_EXTRA1(ptr, rhsV, true) \
   MICRO_ADD_ROWS(1)
 
-#define MICRO_SRC2(ptr, N, M) \
-  if (MICRO_NORMAL_ROWS) { \
-    EIGEN_UNUSED_VARIABLE(strideB); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,1)); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
-  } else { \
-    MICRO_RHS(ptr,1) = rhs_base + N + M; \
-    if (accRows == 3) { \
-      MICRO_RHS(ptr,2) = rhs_base + N*2 + M; \
-    } else { \
-      EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
-    } \
+#define MICRO_SRC2(ptr, N, M)                   \
+  if (MICRO_NORMAL_ROWS) {                      \
+    EIGEN_UNUSED_VARIABLE(strideB);             \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 1));   \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2));   \
+  } else {                                      \
+    MICRO_RHS(ptr, 1) = rhs_base + N + M;       \
+    if (accRows == 3) {                         \
+      MICRO_RHS(ptr, 2) = rhs_base + N * 2 + M; \
+    } else {                                    \
+      EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2)); \
+    }                                           \
   }
 
 #define MICRO_SRC2_PTR MICRO_SRC2(ptr, strideB, 0)
 
 #define MICRO_ZERO_PEEL_ROW MICRO_UNROLL(MICRO_ZERO_PEEL)
 
-#define MICRO_WORK_PEEL(peel) \
-  if (PEEL_ROW > peel) { \
-    MICRO_BROADCAST(peel) \
+#define MICRO_WORK_PEEL(peel)                                                                            \
+  if (PEEL_ROW > peel) {                                                                                 \
+    MICRO_BROADCAST(peel)                                                                                \
     pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+  } else {                                                                                               \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                                                                   \
   }
 
-#define MICRO_WORK_PEEL_ROW \
+#define MICRO_WORK_PEEL_ROW                                                              \
   Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
-  MICRO_UNROLL(MICRO_WORK_PEEL) \
-  lhs_ptr += (remaining_rows * PEEL_ROW); \
+  MICRO_UNROLL(MICRO_WORK_PEEL)                                                          \
+  lhs_ptr += (remaining_rows * PEEL_ROW);                                                \
   MICRO_ADD_ROWS(PEEL_ROW)
 
-#define MICRO_ADD_PEEL(peel, sum) \
-  if (PEEL_ROW > peel) { \
-    for (Index i = 0; i < accRows; i++) { \
+#define MICRO_ADD_PEEL(peel, sum)                        \
+  if (PEEL_ROW > peel) {                                 \
+    for (Index i = 0; i < accRows; i++) {                \
       accZero##sum.packet[i] += accZero##peel.packet[i]; \
-    } \
+    }                                                    \
   }
 
 #define MICRO_ADD_PEEL_ROW \
-  MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
-  MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
+  MICRO_ADD_PEEL(4, 0)     \
+  MICRO_ADD_PEEL(5, 1)     \
+  MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
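
// Sketch of the reduction MICRO_ADD_PEEL_ROW expands to, with plain floats
// assumed in place of PacketBlocks: the eight peel accumulators are folded
// pairwise (4..7 into 0..3, then 2,3 into 0,1, then 1 into 0), keeping the
// peeled updates on independent accumulators instead of one dependency chain.
static float add_peel_row_model(float z[8]) {
  z[0] += z[4]; z[1] += z[5]; z[2] += z[6]; z[3] += z[7];
  z[0] += z[2]; z[1] += z[3];
  z[0] += z[1];
  return z[0];
}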
 
-#define MICRO_PREFETCHN1(ptr, N) \
-  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,0)); \
-  if (N == 2 || N == 3) { \
-    EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,1)); \
-    if (N == 3) { \
-      EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,2)); \
-    } \
+#define MICRO_PREFETCHN1(ptr, N)               \
+  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 0));     \
+  if (N == 2 || N == 3) {                      \
+    EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 1));   \
+    if (N == 3) {                              \
+      EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 2)); \
+    }                                          \
   }
 
 #define MICRO_PREFETCHN(N) MICRO_PREFETCHN1(ptr, N)
 
 #define MICRO_COMPLEX_PREFETCHN(N) \
-  MICRO_PREFETCHN1(ptr_real, N); \
-  if(!RhsIsReal) { \
+  MICRO_PREFETCHN1(ptr_real, N);   \
+  if (!RhsIsReal) {                \
     MICRO_PREFETCHN1(ptr_imag, N); \
   }
 
-template<typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
-  const Scalar* &lhs_ptr,
-  const Scalar* &rhs_ptr0,
-  const Scalar* &rhs_ptr1,
-  const Scalar* &rhs_ptr2,
-  PacketBlock<Packet,accRows> &accZero)
-{
+template <typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(const Scalar*& lhs_ptr, const Scalar*& rhs_ptr0, const Scalar*& rhs_ptr1,
+                                         const Scalar*& rhs_ptr2, PacketBlock<Packet, accRows>& accZero) {
   MICRO_BROADCAST_EXTRA
   pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
   lhs_ptr += remaining_rows;
 }
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
-  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
-  const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
-  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols,
+          const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                     const Scalar* rhs_base, Index depth, Index strideA, Index offsetA,
+                                                     Index strideB, Index row, Index rows, const Packet& pAlpha,
+                                                     const Packet& pMask) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+  const Scalar* lhs_ptr = lhs_base + row * strideA + remaining_rows * offsetA;
+  PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
 
   MICRO_SRC2_PTR
   bsetzero<Packet, accRows>(accZero0);
@@ -2048,16 +1924,14 @@
   Index k = 0;
   if (remaining_depth >= PEEL_ROW) {
     MICRO_ZERO_PEEL_ROW
-    do
-    {
+    do {
       MICRO_PREFETCHN(accRows)
       EIGEN_POWER_PREFETCH(lhs_ptr);
       MICRO_WORK_PEEL_ROW
     } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
     MICRO_ADD_PEEL_ROW
   }
-  for(; k < depth; k++)
-  {
+  for (; k < depth; k++) {
     MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
   }
 
@@ -2065,18 +1939,17 @@
   EIGEN_UNUSED_VARIABLE(rows);
   EIGEN_UNUSED_VARIABLE(pMask);
   bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row, remaining_rows);
-  bscale<Packet,accRows>(acc, accZero0, pAlpha);
+  bscale<Packet, accRows>(acc, accZero0, pAlpha);
   bstore_partial<DataMapper, Packet, accRows>(acc, res, row, remaining_rows);
 #else
   bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
-  if ((accRows == 1) || (rows >= accCols))
-  {
-    bscale<Packet,accRows,true>(acc, accZero0, pAlpha, pMask);
+  if ((accRows == 1) || (rows >= accCols)) {
+    bscale<Packet, accRows, true>(acc, accZero0, pAlpha, pMask);
     bstore<DataMapper, Packet, accRows>(acc, res, row);
   } else {
-    bscale<Packet,accRows,false>(acc, accZero0, pAlpha, pMask);
-    for(Index j = 0; j < accRows; j++) {
-      for(Index i = 0; i < remaining_rows; i++) {
+    bscale<Packet, accRows, false>(acc, accZero0, pAlpha, pMask);
+    for (Index j = 0; j < accRows; j++) {
+      for (Index i = 0; i < remaining_rows; i++) {
         res(row + i, j) = acc.packet[j][i];
       }
     }
@@ -2084,75 +1957,62 @@
 #endif
 }
 
-#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
-  switch(value) { \
-    default: \
-      MICRO_EXTRA_UNROLL(1) \
-      break; \
-    case 2: \
+#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col)   \
+  switch (value) {                                       \
+    default:                                             \
+      MICRO_EXTRA_UNROLL(1)                              \
+      break;                                             \
+    case 2:                                              \
       if (is_col || (sizeof(Scalar) == sizeof(float))) { \
-        MICRO_EXTRA_UNROLL(2) \
-      } \
-      break; \
-    case 3: \
+        MICRO_EXTRA_UNROLL(2)                            \
+      }                                                  \
+      break;                                             \
+    case 3:                                              \
       if (is_col || (sizeof(Scalar) == sizeof(float))) { \
-        MICRO_EXTRA_UNROLL(3) \
-      } \
-      break; \
+        MICRO_EXTRA_UNROLL(3)                            \
+      }                                                  \
+      break;                                             \
   }
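
// An illustrative model of the MICRO_EXTRA dispatch above (hypothetical
// names, a plain template in place of the macro): the remainder count is
// always 1..3, and the 2/3 cases are only reachable for column remainders
// or float-sized scalars, since Packet2d holds two lanes and a double row
// remainder can therefore only be 1.
template <int N>
static void extra_unroll_model() { /* stands in for MICRO_EXTRA_UNROLL(N) */ }
static void micro_extra_model(int value, bool is_col, bool is_float) {
  switch (value) {
    default: extra_unroll_model<1>(); break;
    case 2: if (is_col || is_float) extra_unroll_model<2>(); break;
    case 3: if (is_col || is_float) extra_unroll_model<3>(); break;
  }
}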
 
-#define MICRO_EXTRA_ROWS(N) \
-  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
+#define MICRO_EXTRA_ROWS(N)                                                     \
+  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>( \
+      res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_row(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                        Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+                                        Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
   MICRO_EXTRA(MICRO_EXTRA_ROWS, remaining_rows, false)
 }
 
 #define MICRO_UNROLL_WORK(func, func2, peel) \
-  MICRO_UNROLL(func2); \
-  func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
-  func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+  MICRO_UNROLL(func2);                       \
+  func(0, peel) func(1, peel) func(2, peel) func(3, peel) func(4, peel) func(5, peel) func(6, peel) func(7, peel)
 
-#define MICRO_WORK_ONE(iter, peel) \
-  if (unroll_factor > iter) { \
+#define MICRO_WORK_ONE(iter, peel)                                               \
+  if (unroll_factor > iter) {                                                    \
     pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
   }
 
-#define MICRO_TYPE_PEEL4(func, func2, peel) \
-  if (PEEL > peel) { \
+#define MICRO_TYPE_PEEL4(func, func2, peel)                        \
+  if (PEEL > peel) {                                               \
     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    MICRO_BROADCAST(peel) \
-    MICRO_UNROLL_WORK(func, func2, peel) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+    MICRO_BROADCAST(peel)                                          \
+    MICRO_UNROLL_WORK(func, func2, peel)                           \
+  } else {                                                         \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                             \
   }
 
-#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
-  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
-  func(func1,func2,0) func(func1,func2,1) \
-  func(func1,func2,2) func(func1,func2,3) \
-  func(func1,func2,4) func(func1,func2,5) \
-  func(func1,func2,6) func(func1,func2,7)
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2)                                                           \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M];                        \
+  func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3) func(func1, func2, 4) \
+      func(func1, func2, 5) func(func1, func2, 6) func(func1, func2, 7)
 
 #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
-  Packet rhsV0[M]; \
-  func(func1,func2,0)
+  Packet rhsV0[M];                                   \
+  func(func1, func2, 0)
 
-#define MICRO_UNROLL_TYPE(MICRO_TYPE, size) \
+#define MICRO_UNROLL_TYPE(MICRO_TYPE, size)                       \
   MICRO_TYPE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE) \
   MICRO_ADD_ROWS(size)
 
@@ -2160,11 +2020,11 @@
 
 #define MICRO_ONE4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_ONE, 1)
 
-#define MICRO_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter) { \
+#define MICRO_DST_PTR_ONE(iter)               \
+  if (unroll_factor > iter) {                 \
     bsetzero<Packet, accRows>(accZero##iter); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accZero##iter);     \
   }
 
 #define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
@@ -2174,69 +2034,62 @@
 #define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
 
 #ifdef USE_PARTIAL_PACKETS
-#define MICRO_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    if (MICRO_NORMAL_PARTIAL(iter)) { \
-      bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
-      bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
-      bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
-    } else { \
-      bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter*accCols, accCols2); \
-      bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
-      bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter*accCols, accCols2); \
-    } \
+#define MICRO_STORE_ONE(iter)                                                                         \
+  if (unroll_factor > iter) {                                                                         \
+    if (MICRO_NORMAL_PARTIAL(iter)) {                                                                 \
+      bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0);      \
+      bscale<Packet, accRows>(acc, accZero##iter, pAlpha);                                            \
+      bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols);                            \
+    } else {                                                                                          \
+      bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter * accCols, accCols2); \
+      bscale<Packet, accRows>(acc, accZero##iter, pAlpha);                                            \
+      bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter * accCols, accCols2);          \
+    }                                                                                                 \
   }
 #else
-#define MICRO_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
-    bscale<Packet,accRows,!(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
-    bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
+#define MICRO_STORE_ONE(iter)                                                                  \
+  if (unroll_factor > iter) {                                                                  \
+    bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0); \
+    bscale<Packet, accRows, !(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask);         \
+    bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols);                       \
   }
 #endif
 
 #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
 
 #ifdef USE_PARTIAL_PACKETS
-template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, bool full>
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+          const Index accCols, bool full>
 #else
-template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+          const Index accCols, const Index accCols2>
 #endif
-EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index& row,
-  const Packet& pAlpha,
+EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB, Index& row,
+                                                 const Packet& pAlpha,
 #ifdef USE_PARTIAL_PACKETS
-  Index accCols2
+                                                 Index accCols2
 #else
-  const Packet& pMask
+                                                 const Packet& pMask
 #endif
-  )
-{
-  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
-  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
-  PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
-  PacketBlock<Packet,accRows> acc;
+) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+  const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+               *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
+  PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet, accRows> acc;
 
   MICRO_SRC2_PTR
   MICRO_SRC_PTR
   MICRO_DST_PTR
 
   Index k = 0;
-  for(; k + PEEL <= depth; k+= PEEL)
-  {
+  for (; k + PEEL <= depth; k += PEEL) {
     MICRO_PREFETCHN(accRows)
     MICRO_PREFETCH
     MICRO_ONE_PEEL4
   }
-  for(; k < depth; k++)
-  {
+  for (; k < depth; k++) {
     MICRO_ONE4
   }
   MICRO_STORE
@@ -2245,42 +2098,32 @@
 }
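
// A rough sketch of how unroll_factor is consumed above (plain integers
// assumed in place of the real state; the row advance mirrors the elided
// MICRO_UPDATE step that follows MICRO_STORE): one call accumulates
// unroll_factor independent accCols-wide row blocks against the same rhs
// panel before the caller moves on.
static long unrolled_iteration_row_advance(long row, int unroll_factor, long accCols) {
  return row + unroll_factor * accCols;  // assumed MICRO_UPDATE-style advance
}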
 
 #ifdef USE_PARTIAL_PACKETS
-#define MICRO_UNROLL_ITER2(N, M) \
-  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
+#define MICRO_UNROLL_ITER2(N, M)                                                                              \
+  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(               \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
   if (M) return;
 #else
-#define MICRO_UNROLL_ITER2(N, M) \
-  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
+#define MICRO_UNROLL_ITER2(N, M)                                                                             \
+  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>( \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask);                       \
   if (M) return;
 #endif
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                   Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+                                   Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
   const DataMapper res3 = res.getSubMapper(0, col);
 
-  const Scalar* rhs_base = blockB + col*strideB + MICRO_NEW_ROWS*offsetB;
-  const Scalar* lhs_base = blockA + accCols*offsetA;
+  const Scalar* rhs_base = blockB + col * strideB + MICRO_NEW_ROWS * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
   Index row = 0;
 
 #define MAX_UNROLL 7
-  while(row + MAX_UNROLL*accCols <= rows) {
+  while (row + MAX_UNROLL * accCols <= rows) {
     MICRO_UNROLL_ITER2(MAX_UNROLL, 0);
   }
-  switch( (rows-row)/accCols ) {
+  switch ((rows - row) / accCols) {
 #if MAX_UNROLL > 7
     case 7:
       MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 7)
@@ -2321,59 +2164,50 @@
   }
 #undef MAX_UNROLL
 
-  if(remaining_rows > 0)
-  {
-    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+  if (remaining_rows > 0) {
+    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA,
+                                                                 strideB, row, rows, remaining_rows, pAlpha, pMask);
   }
 }
 
-#define MICRO_EXTRA_COLS(N) \
-  gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
+#define MICRO_EXTRA_COLS(N)                                                                                         \
+  gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, \
+                                                    col, rows, remaining_rows, pAlpha, pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
-  MICRO_EXTRA(MICRO_EXTRA_COLS, cols-col, true)
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                         Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+                                         Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+                                         const Packet& pMask) {
+  MICRO_EXTRA(MICRO_EXTRA_COLS, cols - col, true)
 }
 
 /****************
  * GEMM kernels *
  ****************/
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const Index remaining_rows = rows % accCols;
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows,
+                              Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA,
+                              Index offsetB) {
+  const Index remaining_rows = rows % accCols;
 
-      if( strideA == -1 ) strideA = depth;
-      if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
 
-      const Packet pAlpha = pset1<Packet>(alpha);
-      const Packet pMask  = bmask<Packet>(remaining_rows);
+  const Packet pAlpha = pset1<Packet>(alpha);
+  const Packet pMask = bmask<Packet>(remaining_rows);
 
-      Index col = 0;
-      for(; col + accRows <= cols; col += accRows)
-      {
-        gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
-      }
+  Index col = 0;
+  for (; col + accRows <= cols; col += accRows) {
+    gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB,
+                                                            offsetB, col, rows, remaining_rows, pAlpha, pMask);
+  }
 
-      if (col != cols)
-      {
-        gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
-      }
+  if (col != cols) {
+    gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+                                                         col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
 }
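
// High-level shape of the gemm kernel above, as an illustrative sketch with
// placeholder names: full column panels of accRows are walked first, each
// panel processing rows in accCols-wide blocks (up to MAX_UNROLL of them per
// unrolled call), and the row/column leftovers fall through to the
// gemm_extra_row / gemm_extra_cols paths.
static void gemm_shape_model(long rows, long cols, long accRows, long accCols) {
  const long kMaxUnroll = 7;  // mirrors MAX_UNROLL above
  for (long col = 0; col + accRows <= cols; col += accRows) {
    long row = 0;
    while (row + kMaxUnroll * accCols <= rows) row += kMaxUnroll * accCols;
    // ...then one sized iteration covers (rows - row) / accCols blocks and a
    // gemm_extra_row pass handles the rows % accCols leftover rows...
  }
  // gemm_extra_cols then covers the final cols % accRows columns, if any.
}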
 
 #define accColsC (accCols / 2)
@@ -2384,129 +2218,128 @@
 #define PEEL_COMPLEX 3
 #define PEEL_COMPLEX_ROW 3
 
-#define MICRO_COMPLEX_UNROLL(func) \
-  func(0) func(1) func(2) func(3)
+#define MICRO_COMPLEX_UNROLL(func) func(0) func(1) func(2) func(3)
 
-#define MICRO_COMPLEX_ZERO_PEEL(peel) \
+#define MICRO_COMPLEX_ZERO_PEEL(peel)             \
   if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
-    bsetzero<Packet, accRows>(accReal##peel); \
-    bsetzero<Packet, accRows>(accImag##peel); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accReal##peel); \
-    EIGEN_UNUSED_VARIABLE(accImag##peel); \
+    bsetzero<Packet, accRows>(accReal##peel);     \
+    bsetzero<Packet, accRows>(accImag##peel);     \
+  } else {                                        \
+    EIGEN_UNUSED_VARIABLE(accReal##peel);         \
+    EIGEN_UNUSED_VARIABLE(accImag##peel);         \
   }
 
-#define MICRO_COMPLEX_ADD_ROWS(N, used) \
-  MICRO_ADD(ptr_real, N) \
-  if (!RhsIsReal) { \
-    MICRO_ADD(ptr_imag, N) \
-  } else if (used) { \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
+#define MICRO_COMPLEX_ADD_ROWS(N, used)            \
+  MICRO_ADD(ptr_real, N)                           \
+  if (!RhsIsReal) {                                \
+    MICRO_ADD(ptr_imag, N)                         \
+  } else if (used) {                               \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0)); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1)); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2)); \
   }
 
-#define MICRO_COMPLEX_BROADCAST(peel) \
-  MICRO_BROADCAST1(peel, ptr_real, rhsV, false) \
-  if (!RhsIsReal) { \
+#define MICRO_COMPLEX_BROADCAST(peel)              \
+  MICRO_BROADCAST1(peel, ptr_real, rhsV, false)    \
+  if (!RhsIsReal) {                                \
     MICRO_BROADCAST1(peel, ptr_imag, rhsVi, false) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);            \
   }
 
-#define MICRO_COMPLEX_BROADCAST_EXTRA \
-  Packet rhsV[4], rhsVi[4]; \
-  MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false) \
-  if(!RhsIsReal) { \
+#define MICRO_COMPLEX_BROADCAST_EXTRA              \
+  Packet rhsV[4], rhsVi[4];                        \
+  MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false)    \
+  if (!RhsIsReal) {                                \
     MICRO_BROADCAST_EXTRA1(ptr_imag, rhsVi, false) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsVi); \
-  } \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(rhsVi);                  \
+  }                                                \
   MICRO_COMPLEX_ADD_ROWS(1, true)
 
-#define MICRO_COMPLEX_SRC2_PTR \
-  MICRO_SRC2(ptr_real, strideB*advanceCols, 0) \
-  if (!RhsIsReal) { \
-    MICRO_RHS(ptr_imag,0) = rhs_base + MICRO_NEW_ROWS*strideB; \
-    MICRO_SRC2(ptr_imag, strideB*advanceCols, strideB) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
-    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
+#define MICRO_COMPLEX_SRC2_PTR                                    \
+  MICRO_SRC2(ptr_real, strideB* advanceCols, 0)                   \
+  if (!RhsIsReal) {                                               \
+    MICRO_RHS(ptr_imag, 0) = rhs_base + MICRO_NEW_ROWS * strideB; \
+    MICRO_SRC2(ptr_imag, strideB* advanceCols, strideB)           \
+  } else {                                                        \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0));                \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1));                \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2));                \
   }
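
// A small sketch of the split-complex rhs layout this pointer setup assumes
// (illustration only, float used as the stand-in scalar): the packed real
// parts come first and the matching imaginary parts sit one block further
// on, so the imaginary base is rhs_base offset by MICRO_NEW_ROWS * strideB.
static const float* rhs_imag_base_model(const float* rhs_base, long new_rows, long strideB) {
  return rhs_base + new_rows * strideB;  // mirrors MICRO_RHS(ptr_imag, 0) above
}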
 
 #define MICRO_COMPLEX_ZERO_PEEL_ROW MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_ZERO_PEEL)
 
-#define MICRO_COMPLEX_WORK_PEEL(peel) \
-  if (PEEL_COMPLEX_ROW > peel) { \
-    MICRO_COMPLEX_BROADCAST(peel) \
-    pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
-    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+#define MICRO_COMPLEX_WORK_PEEL(peel)                                                 \
+  if (PEEL_COMPLEX_ROW > peel) {                                                      \
+    MICRO_COMPLEX_BROADCAST(peel)                                                     \
+    pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+        &accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel),       \
+        lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel);             \
+  } else {                                                                            \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                                                \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);                                               \
   }
 
-#define MICRO_COMPLEX_ADD_COLS(size) \
-  lhs_ptr_real += (remaining_rows * size); \
-  if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * size); \
-  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#define MICRO_COMPLEX_ADD_COLS(size)         \
+  lhs_ptr_real += (remaining_rows * size);   \
+  if (!LhsIsReal)                            \
+    lhs_ptr_imag += (remaining_rows * size); \
+  else                                       \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
 
-#define MICRO_COMPLEX_WORK_PEEL_ROW \
-  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
+#define MICRO_COMPLEX_WORK_PEEL_ROW                  \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4];     \
   Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
-  MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL) \
-  MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW) \
+  MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL)      \
+  MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW)           \
   MICRO_COMPLEX_ADD_ROWS(PEEL_COMPLEX_ROW, false)
 
-#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
-  if (PEEL_COMPLEX_ROW > peel) { \
-    for (Index i = 0; i < accRows; i++) { \
+#define MICRO_COMPLEX_ADD_PEEL(peel, sum)                \
+  if (PEEL_COMPLEX_ROW > peel) {                         \
+    for (Index i = 0; i < accRows; i++) {                \
       accReal##sum.packet[i] += accReal##peel.packet[i]; \
       accImag##sum.packet[i] += accImag##peel.packet[i]; \
-    } \
+    }                                                    \
   }
 
 #define MICRO_COMPLEX_ADD_PEEL_ROW \
-  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
-  MICRO_COMPLEX_ADD_PEEL(1, 0)
+  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) MICRO_COMPLEX_ADD_PEEL(1, 0)
 
-template<typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
-  const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
-  const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2,
-  const Scalar* &rhs_ptr_imag0, const Scalar* &rhs_ptr_imag1, const Scalar* &rhs_ptr_imag2,
-  PacketBlock<Packet,accRows> &accReal, PacketBlock<Packet,accRows> &accImag)
-{
+template <typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+          bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(const Scalar*& lhs_ptr_real, const Scalar*& lhs_ptr_imag,
+                                                 const Scalar*& rhs_ptr_real0, const Scalar*& rhs_ptr_real1,
+                                                 const Scalar*& rhs_ptr_real2, const Scalar*& rhs_ptr_imag0,
+                                                 const Scalar*& rhs_ptr_imag1, const Scalar*& rhs_ptr_imag2,
+                                                 PacketBlock<Packet, accRows>& accReal,
+                                                 PacketBlock<Packet, accRows>& accImag) {
   MICRO_COMPLEX_BROADCAST_EXTRA
-  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real,
+                                                                                   lhs_ptr_imag, rhsV, rhsVi);
   MICRO_COMPLEX_ADD_COLS(1)
 }
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
-  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
-  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
-  const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal,
+          const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                             const Scalar* rhs_base, Index depth, Index strideA,
+                                                             Index offsetA, Index strideB, Index row, Index rows,
+                                                             const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                             const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows * row * strideA + remaining_rows * offsetA;
   const Scalar* lhs_ptr_imag = NULL;
-  if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
-  else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
-  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
-  PacketBlock<Packet,accRows> taccReal, taccImag;
-  PacketBlock<Packetc,accRows> acc0, acc1;
-  PacketBlock<Packetc,accRows*2> tRes;
+  if (!LhsIsReal)
+    lhs_ptr_imag = lhs_ptr_real + remaining_rows * strideA;
+  else
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet, accRows> taccReal, taccImag;
+  PacketBlock<Packetc, accRows> acc0, acc1;
+  PacketBlock<Packetc, accRows * 2> tRes;
 
   MICRO_COMPLEX_SRC2_PTR
 
@@ -2517,45 +2350,43 @@
   Index k = 0;
   if (remaining_depth >= PEEL_COMPLEX_ROW) {
     MICRO_COMPLEX_ZERO_PEEL_ROW
-    do
-    {
+    do {
       MICRO_COMPLEX_PREFETCHN(accRows)
       EIGEN_POWER_PREFETCH(lhs_ptr_real);
-      if(!LhsIsReal) {
+      if (!LhsIsReal) {
         EIGEN_POWER_PREFETCH(lhs_ptr_imag);
       }
       MICRO_COMPLEX_WORK_PEEL_ROW
     } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
     MICRO_COMPLEX_ADD_PEEL_ROW
   }
-  for(; k < depth; k++)
-  {
-    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0);
+  for (; k < depth; k++) {
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(
+        lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1,
+        rhs_ptr_imag2, accReal0, accImag0);
   }
 
   constexpr bool full = (remaining_rows > accColsC);
   bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
-  if ((accRows == 1) || (rows >= accCols))
-  {
-    bscalec<Packet,accRows,true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+  if ((accRows == 1) || (rows >= accCols)) {
+    bscalec<Packet, accRows, true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
     bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
     bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
     if (full) {
       bstore<DataMapper, Packetc, accRows>(acc1, res, row + accColsC);
     }
   } else {
-    bscalec<Packet,accRows,false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bscalec<Packet, accRows, false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
     bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
 
-    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
-    {
-      for(Index j = 0; j < accRows; j++) {
+    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) {
+      for (Index j = 0; j < accRows; j++) {
         res(row + 0, j) = pfirst<Packetc>(acc0.packet[j]);
       }
     } else {
       bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
       if (full) {
-        for(Index j = 0; j < accRows; j++) {
+        for (Index j = 0; j < accRows; j++) {
           res(row + accColsC, j) = pfirst<Packetc>(acc1.packet[j]);
         }
       }
@@ -2563,59 +2394,51 @@
   }
 }
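
// Scalar model of the complex alpha scaling (bscalec) used above: an
// illustrative sketch rather than the packet implementation. The accumulated
// (real, imag) pair is multiplied by alpha = (alphaR, alphaI) with the usual
// complex product before being coupled and stored.
static void bscalec_model(float accR, float accI, float alphaR, float alphaI,
                          float& outR, float& outI) {
  outR = accR * alphaR - accI * alphaI;
  outI = accI * alphaR + accR * alphaI;
}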
 
-#define MICRO_COMPLEX_EXTRA_ROWS(N) \
-  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_EXTRA_ROWS(N)                                                                        \
+  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, \
+                                      ConjugateRhs, LhsIsReal, RhsIsReal, N>(                              \
+      res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+                                                Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                                const Packet& pAlphaImag, const Packet& pMask) {
   MICRO_EXTRA(MICRO_COMPLEX_EXTRA_ROWS, remaining_rows, false)
 }
 
 #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
-  MICRO_COMPLEX_UNROLL(func2); \
-  func(0,peel) func(1,peel) func(2,peel) func(3,peel)
+  MICRO_COMPLEX_UNROLL(func2);                       \
+  func(0, peel) func(1, peel) func(2, peel) func(3, peel)
 
-#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
-  if (unroll_factor > iter) { \
-    pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel)                                                \
+  if (unroll_factor > iter) {                                                              \
+    pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(       \
+        &accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
   }
 
 #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
-  if (PEEL_COMPLEX > peel) { \
-    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
-    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
-    MICRO_COMPLEX_BROADCAST(peel) \
-    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
-    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+  if (PEEL_COMPLEX > peel) {                        \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3;              \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3;          \
+    MICRO_COMPLEX_BROADCAST(peel)                   \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel)    \
+  } else {                                          \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);              \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);             \
   }
 
 #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
-  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
-  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
-  func(func1,func2,0) func(func1,func2,1) \
-  func(func1,func2,2) func(func1,func2,3)
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M];              \
+  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M];          \
+  func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3)
 
 #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
-  Packet rhsV0[M], rhsVi0[M];\
-  func(func1,func2,0)
+  Packet rhsV0[M], rhsVi0[M];                                \
+  func(func1, func2, 0)
 
-#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size) \
+#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size)                                        \
   MICRO_COMPLEX_TYPE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE) \
   MICRO_COMPLEX_ADD_ROWS(size, false)
 
@@ -2623,13 +2446,13 @@
 
 #define MICRO_COMPLEX_ONE4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_ONE, 1)
 
-#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
-  if (unroll_factor > iter) { \
+#define MICRO_COMPLEX_DST_PTR_ONE(iter)       \
+  if (unroll_factor > iter) {                 \
     bsetzero<Packet, accRows>(accReal##iter); \
     bsetzero<Packet, accRows>(accImag##iter); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accReal##iter); \
-    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accReal##iter);     \
+    EIGEN_UNUSED_VARIABLE(accImag##iter);     \
   }
 
 #define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
@@ -2638,59 +2461,52 @@
 
 #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
 
-#define MICRO_COMPLEX_STORE_ONE(iter) \
-  if (unroll_factor > iter) { \
-    constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
-    bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter*accCols, 0); \
-    bscalec<Packet,accRows,!(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); \
-    bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1); \
-    bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter*accCols + 0); \
-    if (full) { \
-      bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter*accCols + accColsC); \
-    } \
+#define MICRO_COMPLEX_STORE_ONE(iter)                                                                               \
+  if (unroll_factor > iter) {                                                                                       \
+    constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC));                                          \
+    bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter * accCols, 0);        \
+    bscalec<Packet, accRows, !(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, \
+                                                    taccImag, pMask);                                               \
+    bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);                                  \
+    bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter * accCols + 0);                                      \
+    if (full) {                                                                                                     \
+      bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter * accCols + accColsC);                             \
+    }                                                                                                               \
   }
 
 #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
 
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index& row,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
-  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
-  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
-  const Index imag_delta = accCols*strideA;
-  const Index imag_delta2 = accCols2*strideA;
-  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
-  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
-  PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1;
-  PacketBlock<Packet,accRows> accReal2, accImag2, accReal3, accImag3;
-  PacketBlock<Packet,accRows> taccReal, taccImag;
-  PacketBlock<Packetc,accRows> acc0, acc1;
-  PacketBlock<Packetc,accRows*2> tRes;
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper,
+          const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs,
+          bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                         const Scalar* rhs_base, Index depth, Index strideA,
+                                                         Index offsetA, Index strideB, Index& row,
+                                                         const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                         const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+  const Index imag_delta = accCols * strideA;
+  const Index imag_delta2 = accCols2 * strideA;
+  const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+  const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
+  PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet, accRows> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet, accRows> taccReal, taccImag;
+  PacketBlock<Packetc, accRows> acc0, acc1;
+  PacketBlock<Packetc, accRows * 2> tRes;
 
   MICRO_COMPLEX_SRC2_PTR
   MICRO_COMPLEX_SRC_PTR
   MICRO_COMPLEX_DST_PTR
 
   Index k = 0;
-  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
-  {
+  for (; k + PEEL_COMPLEX <= depth; k += PEEL_COMPLEX) {
     MICRO_COMPLEX_PREFETCHN(accRows)
     MICRO_COMPLEX_PREFETCH
     MICRO_COMPLEX_ONE_PEEL4
   }
-  for(; k < depth; k++)
-  {
+  for (; k < depth; k++) {
     MICRO_COMPLEX_ONE4
   }
   MICRO_COMPLEX_STORE
@@ -2698,38 +2514,29 @@
   MICRO_COMPLEX_UPDATE
 }
 
-#define MICRO_COMPLEX_UNROLL_ITER2(N, M) \
-  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_UNROLL_ITER2(N, M)                                                                  \
+  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, \
+                                  M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(     \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask);    \
   if (M) return;
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                           Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+                                           Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                           const Packet& pAlphaImag, const Packet& pMask) {
   const DataMapper res3 = res.getSubMapper(0, col);
 
-  const Scalar* rhs_base = blockB + advanceCols*col*strideB + MICRO_NEW_ROWS*offsetB;
-  const Scalar* lhs_base = blockA + accCols*offsetA;
+  const Scalar* rhs_base = blockB + advanceCols * col * strideB + MICRO_NEW_ROWS * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
   Index row = 0;
 
 #define MAX_COMPLEX_UNROLL 4
-  while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+  while (row + MAX_COMPLEX_UNROLL * accCols <= rows) {
     MICRO_COMPLEX_UNROLL_ITER2(MAX_COMPLEX_UNROLL, 0);
   }
-  switch( (rows-row)/accCols ) {
+  switch ((rows - row) / accCols) {
 #if MAX_COMPLEX_UNROLL > 4
     case 4:
       MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
@@ -2755,87 +2562,81 @@
   }
 #undef MAX_COMPLEX_UNROLL
 
-  if(remaining_rows > 0)
-  {
-    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  if (remaining_rows > 0) {
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                           RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows,
+                                      remaining_rows, pAlphaReal, pAlphaImag, pMask);
   }
 }
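
The while/switch pair above is a recurring unroll-dispatch idiom in this file: drain as many full MAX_COMPLEX_UNROLL-wide strips as fit, branch once on the exact count of leftover accCols strips so each remainder width gets its own fully unrolled instantiation, then hand any final sub-accCols rows to gemm_complex_extra_row.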
 
-#define MICRO_COMPLEX_EXTRA_COLS(N) \
-  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_EXTRA_COLS(N)                                                                         \
+  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+                    RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows,   \
+                               remaining_rows, pAlphaReal, pAlphaImag, pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
-  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true)
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+          bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB,
+                                                 Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+                                                 const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                 const Packet& pMask) {
+  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols - col, true)
 }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const Index remaining_rows = rows % accCols;
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+          typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
+                                      Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB,
+                                      Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
 
-      if( strideA == -1 ) strideA = depth;
-      if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
 
-      const Packet pAlphaReal = pset1<Packet>(alpha.real());
-      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
-      const Packet pMask = bmask<Packet>(remaining_rows);
+  const Packet pAlphaReal = pset1<Packet>(alpha.real());
+  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+  const Packet pMask = bmask<Packet>(remaining_rows);
 
-      const Scalar* blockA = (Scalar *) blockAc;
-      const Scalar* blockB = (Scalar *) blockBc;
+  const Scalar* blockA = (Scalar*)blockAc;
+  const Scalar* blockB = (Scalar*)blockBc;
 
-      Index col = 0;
-      for(; col + accRows <= cols; col += accRows)
-      {
-        gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
-      }
+  Index col = 0;
+  for (; col + accRows <= cols; col += accRows) {
+    gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                      RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows,
+                                 remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
 
-      if (col != cols)
-      {
-        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
-      }
+  if (col != cols) {
+    gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                            RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+                                       remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
 }
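
For orientation: gemm_complex is the top-level complex GEMM driver. It normalizes the -1 stride sentinels to depth, broadcasts alpha's real and imaginary parts into vector registers, builds the tail mask for the last partial row strip, then walks full accRows-wide column panels and leaves any remaining columns to gemm_complex_extra_cols.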
 
 #undef accColsC
 #undef advanceCols
 #undef advanceRows
 
-EIGEN_ALWAYS_INLINE bool supportsMMA()
-{
+EIGEN_ALWAYS_INLINE bool supportsMMA() {
 #if defined(EIGEN_ALTIVEC_MMA_ONLY)
   return true;
 #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) && defined(__BUILTIN_CPU_SUPPORTS__)
-  return __builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma");
+  return __builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma");
 #else
   return false;  // No dynamic dispatch for LLVM or older GCC
 #endif
 }
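
A minimal, self-contained probe of the same builtins (illustrative sketch, not part of this diff; the guard mirrors the one in supportsMMA(), and the feature strings are the two it tests: Power ISA 3.1 and the Matrix-Multiply Assist facility, both introduced with Power10):

#include <cstdio>

int main() {
#if defined(__BUILTIN_CPU_SUPPORTS__)
  // Reports whether the running CPU advertises the Power10 features.
  std::printf("arch_3_1: %d, mma: %d\n", __builtin_cpu_supports("arch_3_1"),
              __builtin_cpu_supports("mma"));
#else
  std::printf("__builtin_cpu_supports is not available on this compiler\n");
#endif
  return 0;
}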
 
-EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result)
-{
+EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result) {
   Packet4f result_block = ploadu<Packet4f>(result);
   return pmadd(acc, pAlpha, result_block);
 }
 
-template<bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows)
-{
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows) {
   if (lhsExtraRows) {
     pstoreu_partial(result, result_block, extra_rows);
   } else {
@@ -2844,31 +2645,30 @@
   result += rows;
 }
 
-template<bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows)
-{
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+                                      Index extra_cols, Index extra_rows) {
   Index x = 0;
   if (rhsExtraCols) {
-    do{
+    do {
       Packet4f result_block = loadAndMultiplyF32(acc[x], pAlpha, result);
       storeF32<lhsExtraRows>(result, result_block, rows, extra_rows);
     } while (++x < extra_cols);
   } else {
     Packet4f result_block[4];
-    float *result2 = result;
-    do{
+    float* result2 = result;
+    do {
       result_block[x] = loadAndMultiplyF32(acc[x], pAlpha, result);
       result += rows;
     } while (++x < 4);
     x = 0;
-    do{
+    do {
       storeF32<lhsExtraRows>(result2, result_block[x], rows, extra_rows);
     } while (++x < 4);
   }
 }
 
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data) {
   Packet8us z = pset1<Packet8us>(0);
 #ifdef _BIG_ENDIAN
   return reinterpret_cast<Packet4f>(vec_mergeh(data, z));
@@ -2877,8 +2677,7 @@
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data) {
   Packet8us z = pset1<Packet8us>(0);
 #ifdef _BIG_ENDIAN
   return reinterpret_cast<Packet4f>(vec_mergel(data, z));
@@ -2887,12 +2686,11 @@
 #endif
 }
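
A scalar picture of the widening trick in oneConvertBF16Hi/Lo (illustrative sketch, not part of the diff): a bfloat16 is the high 16 bits of an IEEE-754 binary32, so merging each 16-bit lane with a zero lane on the low side reproduces the float bit pattern directly.

#include <cstdint>
#include <cstring>

// Scalar equivalent of widening one bf16 lane (illustrative only).
float bf16_to_f32(std::uint16_t bf) {
  std::uint32_t bits = static_cast<std::uint32_t>(bf) << 16;  // bf16 -> high half
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // reinterpret the bits as binary32
  return f;
}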
 
-template<Index N, Index M>
-EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra = 0)
-{
+template <Index N, Index M>
+EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra = 0) {
   if (N < 4) {
-    pstoreu_partial(to +  0, oneConvertBF16Hi(block.packet[0].m_val), extra);
-  } else if (N >= (M*8+4)) {
+    pstoreu_partial(to + 0, oneConvertBF16Hi(block.packet[0].m_val), extra);
+  } else if (N >= (M * 8 + 4)) {
     pstoreu(to + 0, oneConvertBF16Hi(block.packet[M].m_val));
     if (N >= 8) {
       pstoreu(to + 4, oneConvertBF16Lo(block.packet[M].m_val));
@@ -2900,9 +2698,8 @@
   }
 }
 
-template<Index N>
-EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra)
-{
+template <Index N>
+EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra) {
   storeConvertTwoBF16<N, 0>(to + 0, block, extra);
   if (N >= 16) {
     storeConvertTwoBF16<N, 1>(to + 8, block);
@@ -2913,28 +2710,26 @@
   }
 }
 
-template<bool non_unit_stride, Index delta>
-EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc)
-{
+template <bool non_unit_stride, Index delta>
+EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc) {
   if (non_unit_stride) {
-    return pgather<bfloat16, Packet8bf>(src + delta*resInc, resInc);
+    return pgather<bfloat16, Packet8bf>(src + delta * resInc, resInc);
   } else {
     return ploadu<Packet8bf>(src + delta);
   }
 }
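
The two branches above differ only in addressing; a scalar sketch of the pgather path (illustrative, with a plain pointer-difference stride standing in for Index):

#include <cstddef>
#include <cstdint>

// Gather eight 16-bit values spaced resInc elements apart (illustrative).
void gather8(const std::uint16_t* src, std::ptrdiff_t resInc, std::uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = src[i * resInc];
}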
 
-static Packet16uc p16uc_MERGE16_32_1 = {  0, 1, 16,17,  2, 3, 18,19,  0, 1, 16,17,  2, 3, 18,19 };
-static Packet16uc p16uc_MERGE16_32_2 = {  4, 5, 20,21,  6, 7, 22,23,  4, 5, 20,21,  6, 7, 22,23 };
-static Packet16uc p16uc_MERGE16_32_3 = {  8, 9, 24,25, 10,11, 26,27,  8, 9, 24,25, 10,11, 26,27 };
-static Packet16uc p16uc_MERGE16_32_4 = { 12,13, 28,29, 14,15, 30,31, 12,13, 28,29, 14,15, 30,31 };
+static Packet16uc p16uc_MERGE16_32_1 = {0, 1, 16, 17, 2, 3, 18, 19, 0, 1, 16, 17, 2, 3, 18, 19};
+static Packet16uc p16uc_MERGE16_32_2 = {4, 5, 20, 21, 6, 7, 22, 23, 4, 5, 20, 21, 6, 7, 22, 23};
+static Packet16uc p16uc_MERGE16_32_3 = {8, 9, 24, 25, 10, 11, 26, 27, 8, 9, 24, 25, 10, 11, 26, 27};
+static Packet16uc p16uc_MERGE16_32_4 = {12, 13, 28, 29, 14, 15, 30, 31, 12, 13, 28, 29, 14, 15, 30, 31};
 
-static Packet16uc p16uc_MERGE16_32_5 = { 0,1, 16,17, 16,17, 16,17, 0,1, 16,17, 16,17, 16,17 };
-static Packet16uc p16uc_MERGE16_32_6 = { 2,3, 18,19, 18,19, 18,19, 2,3, 18,19, 18,19, 18,19 };
-static Packet16uc p16uc_MERGE16_32_7 = { 4,5, 20,21, 20,21, 20,21, 4,5, 20,21, 20,21, 20,21 };
-static Packet16uc p16uc_MERGE16_32_8 = { 6,7, 22,23, 22,23, 22,23, 6,7, 22,23, 22,23, 22,23 };
+static Packet16uc p16uc_MERGE16_32_5 = {0, 1, 16, 17, 16, 17, 16, 17, 0, 1, 16, 17, 16, 17, 16, 17};
+static Packet16uc p16uc_MERGE16_32_6 = {2, 3, 18, 19, 18, 19, 18, 19, 2, 3, 18, 19, 18, 19, 18, 19};
+static Packet16uc p16uc_MERGE16_32_7 = {4, 5, 20, 21, 20, 21, 20, 21, 4, 5, 20, 21, 20, 21, 20, 21};
+static Packet16uc p16uc_MERGE16_32_8 = {6, 7, 22, 23, 22, 23, 22, 23, 6, 7, 22, 23, 22, 23, 22, 23};
 
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask) {
   Packet8us z = pset1<Packet8us>(0);
 #ifdef _BIG_ENDIAN
   return reinterpret_cast<Packet4f>(vec_perm(data, z, mask));
@@ -2943,63 +2738,62 @@
 #endif
 }
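
A note on oneConvertBF16Perm and the p16uc_MERGE16_32_* masks above (a reading of the code): vec_perm selects bytes, with indices 0-15 drawn from data and 16-31 from the zero vector, so each index group such as {0, 1, 16, 17} assembles one 32-bit lane holding a bf16 in its high half and zeros below (the little-endian branch swaps the operands to the same effect). Masks 1-4 widen consecutive bf16 pairs, duplicated across the vector; masks 5-8 widen a single element and zero its partner lane, which is what the odd-column tail of convertArrayPointerBF16toF32Dup needs.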
 
-template<bool lhsExtraRows, bool odd, Index size>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float *result, Index rows, const bfloat16* src, Index extra_rows)
-{
-  Packet4f dup[4*4];
+template <bool lhsExtraRows, bool odd, Index size>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float* result, Index rows, const bfloat16* src,
+                                                            Index extra_rows) {
+  Packet4f dup[4 * 4];
   Packet8bf data[4];
 
   for (Index i = 0; i < size; i++) {
-    data[i] = ploadu<Packet8bf>(src + rows*i);
+    data[i] = ploadu<Packet8bf>(src + rows * i);
   }
 
   for (Index i = 0, j = 0; i < size; i++, j += 4) {
-    dup[j+0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
-    dup[j+1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
-    dup[j+2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
-    dup[j+3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
+    dup[j + 0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
+    dup[j + 1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
+    dup[j + 2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
+    dup[j + 3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
   }
 
-  for (Index j = 0; j < 4*size; j += 4) {
+  for (Index j = 0; j < 4 * size; j += 4) {
     if (lhsExtraRows) {
       Packet4f z = pset1<Packet4f>(float(0));
       Index i = 0;
       do {
-        pstoreu(result + (j+i)*4, dup[j+i]);
+        pstoreu(result + (j + i) * 4, dup[j + i]);
       } while (++i < extra_rows);
       do {
-        pstoreu(result + (j+i)*4, z);
+        pstoreu(result + (j + i) * 4, z);
       } while (++i < 4);
     } else {
       for (Index i = 0; i < 4; i++) {
-        pstoreu(result + (j+i)*4, dup[j+i]);
+        pstoreu(result + (j + i) * 4, dup[j + i]);
       }
     }
   }
 }
 
-template<bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float *result, Index cols, Index rows, const bfloat16* src, Index delta, Index extra_rows)
-{
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float* result, Index cols, Index rows, const bfloat16* src,
+                                                         Index delta, Index extra_rows) {
   Index col = 0;
-  src += delta*2;
-  for(; col + 4*2 <= cols; col += 4*2, result += 4*4*4, src += 4*rows) {
-    convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,4>(result, rows, src, extra_rows);
+  src += delta * 2;
+  for (; col + 4 * 2 <= cols; col += 4 * 2, result += 4 * 4 * 4, src += 4 * rows) {
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 4>(result, rows, src, extra_rows);
   }
-  for(; col + 2 <= cols; col += 2, result += 4*4, src += rows) {
-    convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,1>(result, rows, src, extra_rows);
+  for (; col + 2 <= cols; col += 2, result += 4 * 4, src += rows) {
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 1>(result, rows, src, extra_rows);
   }
   if (cols & 1) {
-    convertArrayPointerBF16toF32DupOne<lhsExtraRows,true,1>(result, rows, src - delta, extra_rows);
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, true, 1>(result, rows, src - delta, extra_rows);
   }
 }
 
-template<const Index size, bool non_unit_stride>
-EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float *result, Index rows, bfloat16*& src, Index resInc)
-{
+template <const Index size, bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float* result, Index rows, bfloat16*& src, Index resInc) {
   constexpr Index extra = ((size < 4) ? 4 : size);
   while (i + size <= rows) {
-    PacketBlock<Packet8bf,(size+7)/8> r32;
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
     r32.packet[0] = loadBF16fromResult<non_unit_stride, 0>(src, resInc);
     if (size >= 16) {
       r32.packet[1] = loadBF16fromResult<non_unit_stride, 8>(src, resInc);
@@ -3009,41 +2803,40 @@
       r32.packet[3] = loadBF16fromResult<non_unit_stride, 24>(src, resInc);
     }
     storeConvertBlockBF16<size>(result + i, r32, rows & 3);
-    i += extra; src += extra*resInc;
+    i += extra;
+    src += extra * resInc;
     if (size != 32) break;
   }
 }
 
-template<bool non_unit_stride>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc)
-{
-  for(Index col = 0; col < cols; col++, src += (rows*resInc), result += rows) {
+template <bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+                                                      Index resInc) {
+  for (Index col = 0; col < cols; col++, src += (rows * resInc), result += rows) {
     Index i = 0;
     bfloat16* src2 = src;
     convertPointerBF16toF32<32, non_unit_stride>(i, result, rows, src2, resInc);
     convertPointerBF16toF32<16, non_unit_stride>(i, result, rows, src2, resInc);
-    convertPointerBF16toF32<8,  non_unit_stride>(i, result, rows, src2, resInc);
-    convertPointerBF16toF32<4,  non_unit_stride>(i, result, rows, src2, resInc);
-    convertPointerBF16toF32<1,  non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<8, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<4, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<1, non_unit_stride>(i, result, rows, src2, resInc);
   }
 }
 
-template<Index num_acc, Index size = 4>
-EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size])
-{
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size]) {
   Packet4f z = pset1<Packet4f>(float(0));
 
-  for(Index k = 0; k < num_acc; k++) {
-    for(Index j = 0; j < size; j++) {
+  for (Index k = 0; k < num_acc; k++) {
+    for (Index j = 0; j < size; j++) {
       acc[k][j] = z;
     }
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4])
-{
-  for(Index i = 0; i < num_acc; i++) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4]) {
+  for (Index i = 0; i < num_acc; i++) {
     Packet4ui t0, t1, t2, t3;
     t0 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
     t1 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
@@ -3056,85 +2849,75 @@
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4])
-{
-  for(Index i = 0, j = 0; j < num_acc; i++, j += 2) {
-    for(Index x = 0, y = 0; x < 2; x++, y += 2) {
-      for(Index w = 0, z = 0; w < 2; w++, z += 2) {
-        acc[i][y+w] = acc[j+x][z+0] + acc[j+x][z+1];
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4]) {
+  for (Index i = 0, j = 0; j < num_acc; i++, j += 2) {
+    for (Index x = 0, y = 0; x < 2; x++, y += 2) {
+      for (Index w = 0, z = 0; w < 2; w++, z += 2) {
+        acc[i][y + w] = acc[j + x][z + 0] + acc[j + x][z + 1];
       }
     }
   }
 }
 
-template<Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
-EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result, const Index extra_cols, Index extra_rows)
-{
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
+EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+                                          const Index extra_cols, Index extra_rows) {
   tranposeResults<num_acc>(acc);
   addResults<num_acc>(acc);
 
   constexpr Index real_rhs = ((num_rhs / 2) - (rhsExtraCols ? 1 : 0));
   Index k = 0;
-  for(Index i = 0; i < real_rhs; i++, result += 4*rows, k++){
+  for (Index i = 0; i < real_rhs; i++, result += 4 * rows, k++) {
     storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
   }
-  if(rhsExtraCols) {
+  if (rhsExtraCols) {
     storeResults<rhsExtraCols, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
   }
 }
 
-template<bool zero>
-EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f &dhs1)
-{
-  dhs0 = ploadu<Packet4f>(block + strideB*i + 0);
+template <bool zero>
+EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f& dhs1) {
+  dhs0 = ploadu<Packet4f>(block + strideB * i + 0);
   if (zero) {
     Packet4f dhs2 = pset1<Packet4f>(float(0));
     dhs1 = vec_mergel(dhs0, dhs2);
     dhs0 = vec_mergeh(dhs0, dhs2);
   } else {
-    dhs1 = ploadu<Packet4f>(block + strideB*i + 4);
+    dhs1 = ploadu<Packet4f>(block + strideB * i + 4);
   }
 }
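
A note on the zero branch above (a reading of the code): when depth is odd, KLoop's tail iteration loads only four floats and interleaves them with a zero vector, so the lanes belonging to the missing k+1 step multiply through as zeros instead of reading past the end of the packed panel.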
 
-template<Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
-EIGEN_ALWAYS_INLINE void KLoop
-(
-  const float* indexA,
-  const float* indexB,
-  Packet4f (&acc)[num_acc][4],
-  Index strideB,
-  Index k,
-  Index offsetB,
-  Index extra_cols
-)
-{
+template <Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
+EIGEN_ALWAYS_INLINE void KLoop(const float* indexA, const float* indexB, Packet4f (&acc)[num_acc][4], Index strideB,
+                               Index k, Index offsetB, Index extra_cols) {
   constexpr Index num_lhs = 4;
   Packet4f lhs[num_lhs], rhs[num_rhs];
 
   constexpr Index real_rhs = (num_rhs - (rhsExtraCols ? 2 : 0));
-  for(Index i = 0; i < real_rhs; i += 2){
-    loadTwoRhsFloat32<zero>(indexB + k*4, strideB, i, rhs[i + 0], rhs[i + 1]);
+  for (Index i = 0; i < real_rhs; i += 2) {
+    loadTwoRhsFloat32<zero>(indexB + k * 4, strideB, i, rhs[i + 0], rhs[i + 1]);
   }
-  if(rhsExtraCols) {
-    loadTwoRhsFloat32<zero>(indexB + k*extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
+  if (rhsExtraCols) {
+    loadTwoRhsFloat32<zero>(indexB + k * extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
   }
 
-  indexA += 2*k*4;
-  for(Index j = 0; j < num_lhs; j++) {
-    lhs[j] = ploadu<Packet4f>(indexA + j*4);
+  indexA += 2 * k * 4;
+  for (Index j = 0; j < num_lhs; j++) {
+    lhs[j] = ploadu<Packet4f>(indexA + j * 4);
   }
 
-  for(Index j = 0; j < num_rhs; j++) {
-    for(Index i = 0; i < num_lhs; i++) {
+  for (Index j = 0; j < num_rhs; j++) {
+    for (Index i = 0; i < num_lhs; i++) {
       acc[j][i] = pmadd(rhs[j], lhs[i], acc[j][i]);
     }
   }
 }
 
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result, const Index extra_cols, const Index extra_rows)
-{
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA,
+                                            const float* indexB, Index strideB, Index offsetB, float* result,
+                                            const Index extra_cols, const Index extra_rows) {
   constexpr Index num_rhs = num_acc;
 
   Packet4f acc[num_acc][4];
@@ -3142,10 +2925,10 @@
   zeroAccumulators<num_acc>(acc);
 
   Index k;
-  for(k = 0; k + 2 <= depth; k += 2){
+  for (k = 0; k + 2 <= depth; k += 2) {
     KLoop<num_acc, false, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
   }
-  if(depth&1){
+  if (depth & 1) {
     KLoop<num_acc, true, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
   }
 
@@ -3153,97 +2936,108 @@
 }
 
 // No more than 4 (each call uses 2X the accumulators, i.e. 8X as many VSX registers)
-#define MAX_BFLOAT16_ACC_VSX   4
+#define MAX_BFLOAT16_ACC_VSX 4
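
Worked budget behind that limit (illustrative arithmetic): colVSXLoopBodyIter is invoked with num_acc * 2 accumulators and each accumulator is a Packet4f[4], so num_acc = 4 peaks at 4 * 2 * 4 = 32 live accumulator registers before counting lhs/rhs operands, out of the 64 VSX registers a POWER core provides.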
 
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result)
-{
-  constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+                    const float* indexB, Index strideB, Index offsetB, float* result) {
+  constexpr Index step = (num_acc * 4);  // each accumulator has 4 elements
   const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
   const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
   constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC_VSX);
 
-  do{
-    colVSXLoopBodyIter<num_acc*2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
+  do {
+    colVSXLoopBodyIter<num_acc * 2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB,
+                                                                result, extra_cols, extra_rows);
 
-    indexB += strideB*(num_acc * 2);
-    result += rows*step;
-  } while(multiIters && (step <= cols - (col += step)));
+    indexB += strideB * (num_acc * 2);
+    result += rows * step;
+  } while (multiIters && (step <= cols - (col += step)));
 }
 
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+                                              const float* indexA, const float* blockB, Index strideB, Index offsetB,
+                                              float* result) {
   if (MAX_BFLOAT16_ACC_VSX > num_acc) {
-    colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+    colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA,
+                                                                                 blockB, strideB, offsetB, result);
   }
 }
 
-template<bool rhsExtraCols, bool lhsExtraRows>
-void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
-{
+template <bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+                         const float* blockB, Index strideB, Index offsetB, float* result) {
   switch ((cols - col) >> 2) {
-  case 3:
-    colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 2:
-    colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 1:
-    colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  default:
-    if (rhsExtraCols) {
-      colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    }
-    break;
+    case 3:
+      colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    case 2:
+      colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    case 1:
+      colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    default:
+      if (rhsExtraCols) {
+        colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+      }
+      break;
   }
 }
 
-template<Index size, bool lhsExtraRows = false>
-EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const float* indexA2, const float* blockB2, Index strideA, Index strideB, Index offsetB, float* result2)
-{
-  Index delta_rows = 2*(lhsExtraRows ? (rows & 3) : size);
+template <Index size, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                     const float* indexA2, const float* blockB2, Index strideA, Index strideB,
+                                     Index offsetB, float* result2) {
+  Index delta_rows = 2 * (lhsExtraRows ? (rows & 3) : size);
   for (Index row = 0; row < size; row += 4) {
-    convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float *>(indexA2), strideA, delta_rows, indexA, row, rows & 3);
+    convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float*>(indexA2), strideA, delta_rows, indexA, row,
+                                                  rows & 3);
 
-    const float *blockB = blockB2;
-    float *result = result2 + row;
+    const float* blockB = blockB2;
+    float* result = result2 + row;
 
     Index col = 0;
     if (cols >= (MAX_BFLOAT16_ACC_VSX * 4)) {
-      colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
-      blockB += (strideB >> 1)*col;
-      result += rows*col;
+      colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB,
+                                                                strideB, 0, result);
+      blockB += (strideB >> 1) * col;
+      result += rows * col;
     }
     if (cols & 3) {
-      colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB, result);
+      colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB,
+                                              result);
     } else {
       colVSXLoopBodyExtra<false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
     }
   }
 }
 
-template<Index size>
-EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexB, Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix, float *result)
-{
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth,
+                                         Index cols, Index rows, const Packet4f pAlpha, const float* indexB,
+                                         Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix,
+                                         float* result) {
   if ((size == 16) || (rows & size)) {
-    indexA += size*offsetA;
+    indexA += size * offsetA;
     colVSXLoops<size>(depth, cols, rows, pAlpha, indexA, indexA2, indexB, strideA, strideB, offsetB, result + row);
     row += size;
-    indexA += bigSuffix*size/16;
+    indexA += bigSuffix * size / 16;
   }
 }
 
-template<const Index size, typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float *result, Index rows, const DataMapper& src)
-{
+template <const Index size, typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float* result, Index rows, const DataMapper& src) {
   constexpr Index extra = ((size < 4) ? 4 : size);
   while (i + size <= rows) {
-    PacketBlock<Packet8bf,(size+7)/8> r32;
-    r32.packet[0] = src.template loadPacket<Packet8bf>(i +  0);
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = src.template loadPacket<Packet8bf>(i + 0);
     if (size >= 16) {
-      r32.packet[1] = src.template loadPacket<Packet8bf>(i +  8);
+      r32.packet[1] = src.template loadPacket<Packet8bf>(i + 8);
     }
     if (size >= 32) {
       r32.packet[2] = src.template loadPacket<Packet8bf>(i + 16);
@@ -3255,104 +3049,104 @@
   }
 }
 
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src) {
   typedef typename DataMapper::LinearMapper LinearMapper;
-  for(Index j = 0; j < cols; j++, result += rows){
+  for (Index j = 0; j < cols; j++, result += rows) {
     const LinearMapper src2 = src.getLinearMapper(0, j);
     Index i = 0;
     convertBF16toF32<32, LinearMapper>(i, result, rows, src2);
     convertBF16toF32<16, LinearMapper>(i, result, rows, src2);
-    convertBF16toF32<8,  LinearMapper>(i, result, rows, src2);
-    convertBF16toF32<4,  LinearMapper>(i, result, rows, src2);
-    convertBF16toF32<1,  LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<8, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<4, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<1, LinearMapper>(i, result, rows, src2);
   }
 }
 
-EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float *res)
-{
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float* res) {
   return F32ToBf16Both(ploadu<Packet4f>(res + 0), ploadu<Packet4f>(res + 4));
 }
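
For the opposite direction used above via F32ToBf16Both, a scalar sketch (illustrative; assumes round-to-nearest-even and omits NaN handling, so it is not a drop-in replacement):

#include <cstdint>
#include <cstring>

// Narrow a binary32 to bf16 with round-to-nearest-even (illustrative).
std::uint16_t f32_to_bf16(float f) {
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const std::uint32_t lsb = (bits >> 16) & 1u;    // ties round to even
  bits += 0x7FFFu + lsb;                          // round away the low half
  return static_cast<std::uint16_t>(bits >> 16);  // keep the high half
}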
 
-template<typename DataMapper, const Index size>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float *result, Index col, Index rows, const DataMapper& res)
-{
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float* result, Index col, Index rows, const DataMapper& res) {
   const DataMapper res2 = res.getSubMapper(0, col);
   Index row;
-  float *result2 = result + col*rows;
-  for(row = 0; row + 8 <= rows; row += 8, result2 += 8){
+  float* result2 = result + col * rows;
+  for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
     // get and save block
-    PacketBlock<Packet8bf,size> block;
-    for(Index j = 0; j < size; j++){
-      block.packet[j] = convertF32toBF16VSX(result2 + j*rows);
+    PacketBlock<Packet8bf, size> block;
+    for (Index j = 0; j < size; j++) {
+      block.packet[j] = convertF32toBF16VSX(result2 + j * rows);
     }
-    res2.template storePacketBlock<Packet8bf,size>(row, 0, block);
+    res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
   }
   // extra rows
-  if(row < rows){
-    for(Index j = 0; j < size; j++){
-      Packet8bf fp16 = convertF32toBF16VSX(result2 + j*rows);
+  if (row < rows) {
+    for (Index j = 0; j < size; j++) {
+      Packet8bf fp16 = convertF32toBF16VSX(result2 + j * rows);
       res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
     }
   }
 }
 
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float *result, Index cols, Index rows, const DataMapper& res)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float* result, Index cols, Index rows, const DataMapper& res) {
   Index col;
-  for(col = 0; col + 4 <= cols; col += 4){
-    convertArrayF32toBF16ColVSX<DataMapper,4>(result, col, rows, res);
+  for (col = 0; col + 4 <= cols; col += 4) {
+    convertArrayF32toBF16ColVSX<DataMapper, 4>(result, col, rows, res);
   }
   // extra cols
   switch (cols - col) {
-  case 1:
-    convertArrayF32toBF16ColVSX<DataMapper,1>(result, col, rows, res);
-    break;
-  case 2:
-    convertArrayF32toBF16ColVSX<DataMapper,2>(result, col, rows, res);
-    break;
-  case 3:
-    convertArrayF32toBF16ColVSX<DataMapper,3>(result, col, rows, res);
-    break;
+    case 1:
+      convertArrayF32toBF16ColVSX<DataMapper, 1>(result, col, rows, res);
+      break;
+    case 2:
+      convertArrayF32toBF16ColVSX<DataMapper, 2>(result, col, rows, res);
+      break;
+    case 3:
+      convertArrayF32toBF16ColVSX<DataMapper, 3>(result, col, rows, res);
+      break;
   }
 }
 
-template<typename DataMapper>
-void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename DataMapper>
+void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+                  Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
   float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
   const Packet4f pAlpha = pset1<Packet4f>(falpha);
 
-  if( strideA == -1 ) strideA = depth;
-  if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
 
-  ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
-  ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB*cols, 0);
-  ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2)*4*2, 0);
+  ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
+  ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB* cols, 0);
+  ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2) * 4 * 2, 0);
 
   convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
-  convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16 *>(indexB));
+  convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16*>(indexB));
 
-  Index bigSuffix = 2*8*(strideA-offsetA);
-  float* indexBF32 = indexB2 + 4*offsetB;
+  Index bigSuffix = 2 * 8 * (strideA - offsetA);
+  float* indexBF32 = indexB2 + 4 * offsetB;
   offsetB *= 3;
   strideB *= 2;
 
   Index row = 0;
   // LHS (8x16) block
-  while(row + 16 <= rows){
-    calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+  while (row + 16 <= rows) {
+    calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                        bigSuffix, result);
   }
   // LHS (8x8) block
-  calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+  calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                     bigSuffix, result);
   // LHS (8x4) block
-  calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+  calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                     bigSuffix, result);
   // extra rows
-  if(rows & 3){
+  if (rows & 3) {
     // This index is the beginning of the remaining block.
-    colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB, result + row);
+    colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB,
+                         result + row);
   }
 
   // Convert back to bfloat16
@@ -3366,554 +3160,527 @@
 /************************************
  * ppc64le template specializations *
  * **********************************/
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
-    dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
-    pack(blockA, lhs, depth, rows, stride, offset);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-  ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
-    dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
-    pack(blockA, lhs, depth, rows, stride, offset);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
 }
 
 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 #endif
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
   dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-  ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
   dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
   dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-  ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
   dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
   dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
   dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
 #if EIGEN_ALTIVEC_USE_CUSTOM_PACK
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 #endif
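
Review note, not part of the patch: the two float gemm_pack_rhs specializations above are
compiled only when EIGEN_ALTIVEC_USE_CUSTOM_PACK is nonzero; with the guard off no
specialization is declared, so resolution presumably falls back to the generic
gemm_pack_rhs in the core GEMM code. Toy sketch of that conditional-specialization
pattern (USE_CUSTOM_PACK and pack_rhs are illustrative names):

    #define USE_CUSTOM_PACK 1

    template <typename T>
    struct pack_rhs {          // generic fallback (cf. Eigen's core gemm_pack_rhs)
      void operator()(T*) { /* portable packing */ }
    };

    #if USE_CUSTOM_PACK
    template <>
    struct pack_rhs<float> {   // target-specific override, only compiled on demand
      void operator()(float*) { /* vectorized packing */ }
    };
    #endif
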
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
   dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
   dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
   pack(blockA, lhs, depth, rows, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
-  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
 };
 
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
   pack(blockB, rhs, depth, cols, stride, offset);
 }
 
 // ********* gebp specializations *********
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef typename quad_traits<float>::vectortype   Packet;
-  typedef typename quad_traits<float>::rhstype      RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<float>::vectortype Packet;
+  typedef typename quad_traits<float>::rhstype RhsPacket;
 
-  void operator()(const DataMapper& res, const float* blockA, const float* blockB,
-                  Index rows, Index depth, Index cols, float alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols,
+                  float alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
-               Index rows, Index depth, Index cols, float alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<float>::rows;
-    const Index accCols = quad_traits<float>::size;
-    static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
-    #endif
-        &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols, float alpha,
+    Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index,
+                               Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+                      &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
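
Review note, not part of the patch: every gebp_kernel body in this file uses the same
runtime-dispatch idiom -- a function-local static function pointer whose one-time
initializer picks the MMA kernel when supportsMMA() returns true and the generic VSX
kernel otherwise, with the MMA alternative compiled in only under
EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H. Since C++11 that static initialization runs exactly
once and is thread-safe, so the CPU check is paid on the first call only. Minimal sketch
with hypothetical kernel names (the guard macro and supportsMMA() come from the source):

    bool supportsMMA();  // provided elsewhere by the AltiVec backend

    void kernel_generic(const float* in, float* out, int n);
    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
    void kernel_mma(const float* in, float* out, int n);
    #endif

    void run(const float* in, float* out, int n) {
      // Initialized on the first call only; later calls reuse the cached pointer.
      static void (*fn)(const float*, float*, int) =
    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
          supportsMMA() ? &kernel_mma :
    #endif
                          &kernel_generic;
      fn(in, out, n);
    }
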
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef Packet4f   Packet;
-  typedef Packet2cf  Packetc;
-  typedef Packet4f   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
 
   void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
-                  Index rows, Index depth, Index cols, std::complex<float> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+                  Index rows, Index depth, Index cols, std::complex<float> alpha, Index strideA = -1,
+                  Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
-               Index rows, Index depth, Index cols, std::complex<float> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<float>::rows;
-    const Index accCols = quad_traits<float>::size;
-    static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
-          Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
-    #endif
-        &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs,
+                 ConjugateRhs>::operator()(const DataMapper& res, const std::complex<float>* blockA,
+                                           const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+                                           std::complex<float> alpha, Index strideA, Index strideB, Index offsetA,
+                                           Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*, Index, Index,
+                               Index, std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>,
+                                                          float, Packet, Packetc, RhsPacket, DataMapper, accRows,
+                                                          accCols, ConjugateLhs, ConjugateRhs, false, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>,
+                                                     float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                     ConjugateLhs, ConjugateRhs, false, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef Packet4f   Packet;
-  typedef Packet2cf  Packetc;
-  typedef Packet4f   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
 
-  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
-                  Index rows, Index depth, Index cols, std::complex<float> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows,
+                  Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
-               Index rows, Index depth, Index cols, std::complex<float> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<float>::rows;
-    const Index accCols = quad_traits<float>::size;
-    static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
-          Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
-    #endif
-        &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+    std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*, Index, Index, Index,
+                               std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, true, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, true, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef Packet4f   Packet;
-  typedef Packet2cf  Packetc;
-  typedef Packet4f   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
 
-  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
-                  Index rows, Index depth, Index cols, std::complex<float> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows,
+                  Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
-               Index rows, Index depth, Index cols, std::complex<float> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<float>::rows;
-    const Index accCols = quad_traits<float>::size;
-    static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
-          Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
-    #endif
-        &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows, Index depth, Index cols,
+    std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*, Index, Index, Index,
+                               std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, false, true>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, false, true>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef typename quad_traits<double>::vectortype  Packet;
-  typedef typename quad_traits<double>::rhstype     RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<double>::vectortype Packet;
+  typedef typename quad_traits<double>::rhstype RhsPacket;
 
-  void operator()(const DataMapper& res, const double* blockA, const double* blockB,
-                  Index rows, Index depth, Index cols, double alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth,
+                  Index cols, double alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+                  Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
-               Index rows, Index depth, Index cols, double alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<double>::rows;
-    const Index accCols = quad_traits<double>::size;
-    static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
-    #endif
-        &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth, Index cols,
+    double alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index,
+                               Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+                      &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef quad_traits<double>::vectortype   Packet;
-  typedef Packet1cd  Packetc;
-  typedef quad_traits<double>::rhstype   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
 
   void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
-                  Index rows, Index depth, Index cols, std::complex<double> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+                  Index rows, Index depth, Index cols, std::complex<double> alpha, Index strideA = -1,
+                  Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
-               Index rows, Index depth, Index cols, std::complex<double> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<double>::rows;
-    const Index accCols = quad_traits<double>::size;
-    static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
-          Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
-    #endif
-        &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs,
+                 ConjugateRhs>::operator()(const DataMapper& res, const std::complex<double>* blockA,
+                                           const std::complex<double>* blockB, Index rows, Index depth, Index cols,
+                                           std::complex<double> alpha, Index strideA, Index strideB, Index offsetA,
+                                           Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*, Index,
+                               Index, Index, std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA())
+          ? &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
+                                              Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                              ConjugateRhs, false, false>
+          :
+#endif
+          &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
+                                         Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                         ConjugateRhs, false, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef quad_traits<double>::vectortype   Packet;
-  typedef Packet1cd  Packetc;
-  typedef quad_traits<double>::rhstype   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
 
-  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
-                  Index rows, Index depth, Index cols, std::complex<double> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows,
+                  Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
-               Index rows, Index depth, Index cols, std::complex<double> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<double>::rows;
-    const Index accCols = quad_traits<double>::size;
-    static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
-          Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
-    #endif
-        &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows, Index depth,
+    Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*, Index, Index, Index,
+                               std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, false, true>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, false, true>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef quad_traits<double>::vectortype   Packet;
-  typedef Packet1cd  Packetc;
-  typedef quad_traits<double>::rhstype   RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
 
-  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
-                  Index rows, Index depth, Index cols, std::complex<double> alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows,
+                  Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
-               Index rows, Index depth, Index cols, std::complex<double> alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    const Index accRows = quad_traits<double>::rows;
-    const Index accCols = quad_traits<double>::size;
-    static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
-          Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
-    #endif
-        &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows, Index depth,
+    Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*, Index, Index, Index,
+                               std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, true, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, true, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
-  typedef typename quad_traits<bfloat16>::vectortype   Packet;
-  typedef typename quad_traits<bfloat16>::rhstype      RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<bfloat16>::vectortype Packet;
+  typedef typename quad_traits<bfloat16>::rhstype RhsPacket;
 
-  void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
-                  Index rows, Index depth, Index cols, bfloat16 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth,
+                  Index cols, bfloat16 alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+                  Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-  ::operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
-               Index rows, Index depth, Index cols, bfloat16 alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB)
-  {
-    static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16, Index, Index, Index, Index) =
-    #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-      (supportsMMA()) ?
-        &Eigen::internal::gemmMMAbfloat16<DataMapper> :
-    #endif
-        &Eigen::internal::gemmbfloat16<DataMapper>;
-    gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
-  }
-} // end namespace internal
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols,
+    bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16,
+                               Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMAbfloat16<DataMapper> :
+#endif
+                      &Eigen::internal::gemmbfloat16<DataMapper>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#endif  // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index fa1755f..e78ca5a 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -1,6 +1,6 @@
-//#define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
+// #define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
 #ifdef EIGEN_POWER_USE_PREFETCH
-#define EIGEN_POWER_PREFETCH(p)  prefetch(p)
+#define EIGEN_POWER_PREFETCH(p) prefetch(p)
 #else
 #define EIGEN_POWER_PREFETCH(p)
 #endif
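
Review note, not part of the patch: with EIGEN_POWER_USE_PREFETCH left commented out,
EIGEN_POWER_PREFETCH(p) expands to nothing, so the prefetch hints scattered through the
gemm routines cost nothing in the default build. Sketch with a hypothetical call site
(walk() is illustrative; prefetch() is Eigen's existing wrapper):

    #ifdef EIGEN_POWER_USE_PREFETCH
    #define EIGEN_POWER_PREFETCH(p) prefetch(p)  // emits a software prefetch
    #else
    #define EIGEN_POWER_PREFETCH(p)              // expands to nothing
    #endif

    void walk(const float* a, int n) {
      for (int i = 0; i + 16 <= n; i += 16) {
        EIGEN_POWER_PREFETCH(a + i + 64);  // hint at a future cache line; no-op by default
        // ... consume a[i .. i+15] ...
      }
    }
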
@@ -16,158 +16,125 @@
 
 namespace internal {
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_row(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                        Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+                                        Index remaining_rows, const Packet& pAlpha, const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                         Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+                                         Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+                                         const Packet& pMask);
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+                                                Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                                const Packet& pAlphaImag, const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index cols,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask);
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+          bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB,
+                                                 Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+                                                 const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                 const Packet& pMask);
 
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src);
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src);
 
-template<const Index size, bool non_unit_stride, Index delta>
+template <const Index size, bool non_unit_stride, Index delta>
 EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra = 0);
 
-template<bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc = 1);
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+                                                      Index resInc = 1);
 
-template<bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows);
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+                                      Index extra_cols, Index extra_rows);
 
-template<Index num_acc, bool extraRows, Index size = 4>
-EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows);
+template <Index num_acc, bool extraRows, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+                                             Index extra_rows);
 
-template<Index num_acc, Index size = 4>
-EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha);
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha);
 
-template<typename RhsMapper, bool linear>
+template <typename RhsMapper, bool linear>
 EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j);
 
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs);
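
Review note, not part of the patch: the new spacing here, "const __UNPACK_TYPE__(Packet)
* lhs", is a clang-format quirk rather than a semantic change. Syntactically
__UNPACK_TYPE__(Packet) looks like a call expression, so the tool spaces the '*' as a
binary operator, but the compiler still parses a pointer declarator:

    // Suppose __UNPACK_TYPE__(Packet) resolves to float (as it does for Packet4f):
    typedef float T;
    float ploadLhs_equivalent(const T* lhs);  // the parameter is still a const pointer

If the old spacing matters, clang-format's TypenameMacros option can list
__UNPACK_TYPE__ so the macro is treated as naming a type.
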
 
-template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col);
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N,
+          bool full = true>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                               Index col);
 
-template<typename DataMapper, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row);
 
 #ifdef USE_PARTIAL_PACKETS
-template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
-EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements);
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                                       Index elements);
 
-template<typename DataMapper, typename Packet, Index N>
-EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements);
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements);
 #endif
 
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha);
 
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask);
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+                                const Packet& pMask);
 
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask);
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+                                 const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+                                 const Packet& pMask);
 
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2);
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                 PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+                                 PacketBlock<Packetc, N>& acc2);
 
-#define MICRO_NORMAL(iter) \
-  (accCols == accCols2) || (unroll_factor != (iter + 1))
+#define MICRO_NORMAL(iter) (accCols == accCols2) || (unroll_factor != (iter + 1))
 
-#define MICRO_UNROLL_ITER1(func, N) \
-  switch (remaining_rows) { \
-    default: \
-      func(N, 0) \
-      break; \
-    case 1: \
-      func(N, 1) \
-      break; \
-    case 2: \
+#define MICRO_UNROLL_ITER1(func, N)          \
+  switch (remaining_rows) {                  \
+    default:                                 \
+      func(N, 0) break;                      \
+    case 1:                                  \
+      func(N, 1) break;                      \
+    case 2:                                  \
       if (sizeof(Scalar) == sizeof(float)) { \
-        func(N, 2) \
-      } \
-      break; \
-    case 3: \
+        func(N, 2)                           \
+      }                                      \
+      break;                                 \
+    case 3:                                  \
       if (sizeof(Scalar) == sizeof(float)) { \
-        func(N, 3) \
-      } \
-      break; \
+        func(N, 3)                           \
+      }                                      \
+      break;                                 \
   }
 
 #ifdef USE_PARTIAL_PACKETS
 #define MICRO_UNROLL_ITER(func, N) \
-  if (remaining_rows) { \
-    func(N, true); \
-  } else { \
-    func(N, false); \
+  if (remaining_rows) {            \
+    func(N, true);                 \
+  } else {                         \
+    func(N, false);                \
   }
 
-#define MICRO_NORMAL_PARTIAL(iter) \
-  full || (unroll_factor != (iter + 1))
+#define MICRO_NORMAL_PARTIAL(iter) full || (unroll_factor != (iter + 1))
 #else
 #define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
 #endif
@@ -176,37 +143,38 @@
 
 #define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
 
-#define MICRO_LOAD1(lhs_ptr, iter) \
-  if (unroll_factor > iter) { \
-    lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
+#define MICRO_LOAD1(lhs_ptr, iter)                               \
+  if (unroll_factor > iter) {                                    \
+    lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter);                \
     lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+  } else {                                                       \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter);                           \
   }
 
 #define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)
 
-#define MICRO_COMPLEX_LOAD_ONE(iter) \
-  if (!LhsIsReal && (unroll_factor > iter)) { \
+#define MICRO_COMPLEX_LOAD_ONE(iter)                                                                       \
+  if (!LhsIsReal && (unroll_factor > iter)) {                                                              \
     lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
-  } \
-  MICRO_LOAD1(lhs_ptr_real, iter) \
+  } else {                                                                                                 \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter);                                                                    \
+  }                                                                                                        \
+  MICRO_LOAD1(lhs_ptr_real, iter)
 
-#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
-  if (unroll_factor > iter) { \
-    lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter)                                  \
+  if (unroll_factor > iter) {                                                   \
+    lhs_ptr##iter = lhs_base + (row + (iter * accCols)) * strideA * advRows -   \
+                    MICRO_NORMAL_COLS(iter, 0, (accCols - accCols2) * offsetA); \
+  } else {                                                                      \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter);                                       \
   }
 
 #define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)
 
 #define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)
 
-#define MICRO_PREFETCH1(lhs_ptr, iter) \
-  if (unroll_factor > iter) { \
+#define MICRO_PREFETCH1(lhs_ptr, iter)   \
+  if (unroll_factor > iter) {            \
     EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
   }
 
@@ -220,19 +188,18 @@
 #define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
 #endif
 
-#define MICRO_UPDATE \
-  if (accCols == accCols2) { \
-    MICRO_UPDATE_MASK \
+#define MICRO_UPDATE                \
+  if (accCols == accCols2) {        \
+    MICRO_UPDATE_MASK               \
     EIGEN_UNUSED_VARIABLE(offsetA); \
-    row += unroll_factor*accCols; \
+    row += unroll_factor * accCols; \
   }
 
-#define MICRO_COMPLEX_UPDATE \
-  MICRO_UPDATE \
-  if(LhsIsReal || (accCols == accCols2)) { \
-    EIGEN_UNUSED_VARIABLE(imag_delta2); \
+#define MICRO_COMPLEX_UPDATE                \
+  MICRO_UPDATE                              \
+  if (LhsIsReal || (accCols == accCols2)) { \
+    EIGEN_UNUSED_VARIABLE(imag_delta2);     \
   }
 
-
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 72e8c31..94c5dd2 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -37,14 +37,11 @@
 
 #define accColsC (accCols / 2)
 
-EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
-{
-  __builtin_mma_xxsetaccz(acc);
-}
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); }
 
-template<typename DataMapper, typename Packet, bool full>
-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
-{
+template <typename DataMapper, typename Packet, bool full>
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements,
+                                          __vector_quad* acc) {
   PacketBlock<Packet, 4> result;
   __builtin_mma_disassemble_acc(&result.packet, acc);
 
@@ -61,9 +58,10 @@
   }
 }
 
-template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
-EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag)
-{
+template <typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal,
+                                                 const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal,
+                                                 __vector_quad* accImag) {
   constexpr bool full = (accCols2 > accColsC);
   PacketBlock<Packet, 4> resultReal, resultImag;
   __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
@@ -85,80 +83,70 @@
 }
 
 // Defaults to float32; since Eigen still supports C++03, we can't use default template arguments
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
-{
-  if(NegativeAccumulate)
-  {
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+  if (NegativeAccumulate) {
     __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
   } else {
     __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
   }
 }
 
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
-{
-  if(NegativeAccumulate)
-  {
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) {
+  if (NegativeAccumulate) {
     __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
   } else {
     __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
   }
 }
 
-template<typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi, const RhsPacket& rhsV, RhsPacket& rhsVi)
-{
-  pgerMMA<Packet, RhsPacket, false>(accReal,  rhsV,  lhsV);
-  if(LhsIsReal) {
-    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+template <typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi,
+                                  const RhsPacket& rhsV, RhsPacket& rhsVi) {
+  pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
+  if (LhsIsReal) {
+    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
     EIGEN_UNUSED_VARIABLE(lhsVi);
   } else {
-    if(!RhsIsReal) {
+    if (!RhsIsReal) {
       pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
-      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi,  lhsV);
+      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
     } else {
       EIGEN_UNUSED_VARIABLE(rhsVi);
     }
-    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag,  rhsV, lhsVi);
+    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
   }
 }
 
 // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet)* rhs)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet) * rhs) {
   return ploadu<Packet>(rhs);
 }
 
-template<typename Scalar, typename Packet>
-EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
-{
+template <typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) {
   rhsV = ploadRhs<Packet>(rhs);
-} 
+}
 
-template<>
-EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV)
-{
+template <>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) {
 #if EIGEN_COMP_LLVM
-  __builtin_vsx_assemble_pair(&rhsV,
-    reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
-    reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
+  __builtin_vsx_assemble_pair(
+      &rhsV, reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
+      reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
 #else
-  rhsV = *reinterpret_cast<__vector_pair *>(const_cast<double *>(rhs));
+  rhsV = *reinterpret_cast<__vector_pair*>(const_cast<double*>(rhs));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
-{
-  ploadRhsMMA(lhs, lhsV);
-}
+EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) { ploadRhsMMA(lhs, lhsV); }
 
 #define GEMM_MULTIPLE_COLS
 
 // Disable in GCC until unnecessary register moves are fixed
-//#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+// #if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
 #if EIGEN_COMP_LLVM
 #define VECTOR_PAIR_LOADS_LHS
 #endif
@@ -175,134 +163,127 @@
 #endif
 #endif
 
-#define MICRO_MMA_UNROLL(func) \
-  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+#define MICRO_MMA_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
 
-#define MICRO_MMA_WORK(func, type, peel) \
-  if (accItr == 1) { \
-    func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
-    func(4,type,peel,4,0) func(5,type,peel,5,0) func(6,type,peel,6,0) func(7,type,peel,7,0) \
-  } else if (accItr == 2) { \
-    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
-    func(4,type,peel,2,0) func(5,type,peel,2,1) func(6,type,peel,3,0) func(7,type,peel,3,1) \
-  } else { \
-    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
-    func(4,type,peel,1,0) func(5,type,peel,1,1) func(6,type,peel,1,2) func(7,type,peel,1,3) \
+#define MICRO_MMA_WORK(func, type, peel)                                                                        \
+  if (accItr == 1) {                                                                                            \
+    func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0)     \
+        func(4, type, peel, 4, 0) func(5, type, peel, 5, 0) func(6, type, peel, 6, 0) func(7, type, peel, 7, 0) \
+  } else if (accItr == 2) {                                                                                     \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1)     \
+        func(4, type, peel, 2, 0) func(5, type, peel, 2, 1) func(6, type, peel, 3, 0) func(7, type, peel, 3, 1) \
+  } else {                                                                                                      \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3)     \
+        func(4, type, peel, 1, 0) func(5, type, peel, 1, 1) func(6, type, peel, 1, 2) func(7, type, peel, 1, 3) \
   }
 
-#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
-  if (unroll_factor > left) { \
+#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right)                        \
+  if (unroll_factor > left) {                                                    \
     pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
   }
 
 #ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
-  if (unroll_factor > left) { \
+#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right)                                          \
+  if (unroll_factor > left) {                                                                      \
     pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
   }
 
-#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
-  if (unroll_factor > left) { \
-    if (MICRO_NORMAL(left)) { \
-      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
+#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left)                                                        \
+  if (unroll_factor > left) {                                                                     \
+    if (MICRO_NORMAL(left)) {                                                                     \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left);                   \
       __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
-      lhs_ptr##left += accCols*2; \
-    } else { \
-      lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
-      lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
-      lhs_ptr##left += accCols2*2; \
-      EIGEN_UNUSED_VARIABLE(plhsV##left); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhsV2##left); \
-    EIGEN_UNUSED_VARIABLE(plhsV##left); \
+      lhs_ptr##left += accCols * 2;                                                               \
+    } else {                                                                                      \
+      lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left);                                    \
+      lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2);                         \
+      lhs_ptr##left += accCols2 * 2;                                                              \
+      EIGEN_UNUSED_VARIABLE(plhsV##left);                                                         \
+    }                                                                                             \
+  } else {                                                                                        \
+    EIGEN_UNUSED_VARIABLE(lhsV2##left);                                                           \
+    EIGEN_UNUSED_VARIABLE(plhsV##left);                                                           \
   }
 
 #define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
 #endif
 
-#define MICRO_MMA_UNROLL_ITER(func, val) \
-  func(val,0) \
-  if (accItr > 1) { \
-    func(val,1) \
-    if (accItr > 2) { \
-      func(val,2) \
-      func(val,3) \
-    } \
+#define MICRO_MMA_UNROLL_ITER(func, val)                       \
+  func(val, 0) if (accItr > 1) {                               \
+    func(val, 1) if (accItr > 2) { func(val, 2) func(val, 3) } \
   }
 
-#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) \
-  ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
+#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
 
-#define MICRO_MMA_LOAD_ONE_RHS(peel) \
-  MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
+#define MICRO_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
 
-#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
-  if (PEEL_MMA > peel) { \
+#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel)              \
+  if (PEEL_MMA > peel) {                                           \
     Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
-    MICRO_MMA_LOAD_ONE_RHS(peel) \
-    MICRO_MMA_UNROLL(funcl) \
-    MICRO_MMA_WORK(funcw, type, peel) \
+    MICRO_MMA_LOAD_ONE_RHS(peel)                                   \
+    MICRO_MMA_UNROLL(funcl)                                        \
+    MICRO_MMA_WORK(funcw, type, peel)                              \
   }
 
 #ifndef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type)                                                  \
   type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
-  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
-  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
-  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
-  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 1)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 2)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 3)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 4)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 5)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 6) MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 7)
 #else
-#define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
+#define MICRO_MMA_LOAD_TWO_RHS(peel1, right)                                                      \
   ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
   __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
 
-#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
-  if (PEEL_MMA > peel2) { \
-    PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
-    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
-    if (sizeof(type) == 16) { \
-      MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
-    } else { \
-      EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
-      MICRO_MMA_LOAD_ONE_RHS(peel1) \
-      MICRO_MMA_LOAD_ONE_RHS(peel2) \
-    } \
-    MICRO_MMA_UNROLL(funcl2) \
-    MICRO_MMA_WORK(funcw2, type, peel1) \
-    MICRO_MMA_WORK(funcw2, type, peel2) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
-    MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
+#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2)           \
+  if (PEEL_MMA > peel2) {                                                                  \
+    PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
+    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7;          \
+    if (sizeof(type) == 16) {                                                              \
+      MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1)                                 \
+    } else {                                                                               \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                 \
+      MICRO_MMA_LOAD_ONE_RHS(peel1)                                                        \
+      MICRO_MMA_LOAD_ONE_RHS(peel2)                                                        \
+    }                                                                                      \
+    MICRO_MMA_UNROLL(funcl2)                                                               \
+    MICRO_MMA_WORK(funcw2, type, peel1)                                                    \
+    MICRO_MMA_WORK(funcw2, type, peel2)                                                    \
+  } else {                                                                                 \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                   \
+    MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1)                                       \
   }
 
-#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
+#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type)                               \
   type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
-  __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
-  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
-  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
-  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
-  MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
+  __vector_pair prhsV0, prhsV2, prhsV4, prhsV6;                                                         \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 4, 5)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 6, 7)
 #endif
 
 #define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
-  type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
-  MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
+  type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1];        \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)
 
-#define MICRO_MMA_UPDATE_RHS1(size, right) \
-  rhs_ptr##right += (accRows * size);
+#define MICRO_MMA_UPDATE_RHS1(size, right) rhs_ptr##right += (accRows * size);
 
-#define MICRO_MMA_UPDATE_RHS(size) \
-  MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
+#define MICRO_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
 
-#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
+#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size)             \
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
   MICRO_MMA_UPDATE_RHS(size)
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
 #else
-#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
+#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size)                                                    \
   MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
   MICRO_MMA_UPDATE_RHS(size)
 
@@ -311,10 +292,10 @@
 
 #define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
 
-#define MICRO_MMA_DST_PTR_ONE(iter) \
-  if (unroll_factor * accItr > iter) { \
-    bsetzeroMMA(&accZero##iter); \
-  } else { \
+#define MICRO_MMA_DST_PTR_ONE(iter)       \
+  if (unroll_factor * accItr > iter) {    \
+    bsetzeroMMA(&accZero##iter);          \
+  } else {                                \
     EIGEN_UNUSED_VARIABLE(accZero##iter); \
   }
 
@@ -324,50 +305,40 @@
 
 #define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
 
-#define MICRO_MMA_STORE_ONE(iter, left, right) \
-  if (unroll_factor > left) { \
-    storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left*accCols, res##right, pAlpha, accCols2, &accZero##iter); \
+#define MICRO_MMA_STORE_ONE(iter, left, right)                                                                 \
+  if (unroll_factor > left) {                                                                                  \
+    storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left * accCols, res##right, pAlpha, \
+                                                                     accCols2, &accZero##iter);                \
   }
 
-#define MICRO_MMA_ITER_UNROLL(func) \
-  if (accItr == 1) { \
-    func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
-    func(4,4,0) func(5,5,0) func(6,6,0) func(7,7,0) \
-  } else if (accItr == 2) { \
-    func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
-    func(4,2,0) func(5,2,1) func(6,3,0) func(7,3,1) \
-  } else { \
-    func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
-    func(4,1,0) func(5,1,1) func(6,1,2) func(7,1,3) \
+#define MICRO_MMA_ITER_UNROLL(func)                                                                                 \
+  if (accItr == 1) {                                                                                                \
+    func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) func(4, 4, 0) func(5, 5, 0) func(6, 6, 0) func(7, 7, 0) \
+  } else if (accItr == 2) {                                                                                         \
+    func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) func(4, 2, 0) func(5, 2, 1) func(6, 3, 0) func(7, 3, 1) \
+  } else {                                                                                                          \
+    func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) func(4, 1, 0) func(5, 1, 1) func(6, 1, 2) func(7, 1, 3) \
   }
 
 #define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
 
-#define MICRO_MMA_EXTRA_ROWS(right) \
-  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3##right, blockA, rhs_base + right*accRows*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+#define MICRO_MMA_EXTRA_ROWS(right)                                                                           \
+  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(                                               \
+      res3##right, blockA, rhs_base + right * accRows * strideB, depth, strideA, offsetA, strideB, row, rows, \
+      remaining_rows, pAlpha, pMask);
 
-#define MICRO_MMA_EXTRA_ROWS1(val, right) \
-  MICRO_MMA_EXTRA_ROWS(right);
+#define MICRO_MMA_EXTRA_ROWS1(val, right) MICRO_MMA_EXTRA_ROWS(right);
 
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
-  const DataMapper& res0,
-  const DataMapper& res1,
-  const DataMapper& res2,
-  const DataMapper& res3,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index strideB,
-  Index offsetA,
-  Index& row,
-  const Packet& pAlpha,
-  Index accCols2
-  )
-{
-  const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL, * rhs_ptr3 = NULL;
-  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+template <int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper,
+          const Index accRows, const Index accCols, bool full, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+                                                     const DataMapper& res2, const DataMapper& res3,
+                                                     const Scalar* lhs_base, const Scalar* rhs_base, Index depth,
+                                                     Index strideA, Index strideB, Index offsetA, Index& row,
+                                                     const Packet& pAlpha, Index accCols2) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL, *rhs_ptr3 = NULL;
+  const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+               *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
   __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
 
   if (accItr > 1) {
@@ -391,14 +362,12 @@
   MICRO_MMA_DST_PTR
 
   Index k = 0, depth2 = depth - PEEL_MMA;
-  for(; k <= depth2; k += PEEL_MMA)
-  {
+  for (; k <= depth2; k += PEEL_MMA) {
     EIGEN_POWER_PREFETCH(rhs_ptr);
     MICRO_MMA_PREFETCH
     MICRO_MMA_ONE_PEEL
   }
-  for(; k < depth; k++)
-  {
+  for (; k < depth; k++) {
     MICRO_MMA_ONE
   }
   MICRO_MMA_STORE
@@ -406,38 +375,29 @@
   MICRO_UPDATE
 }
 
-#define MICRO_MMA_UNROLL_ITER2(N, M) \
-  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
+#define MICRO_MMA_UNROLL_ITER2(N, M)                                                                                 \
+  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>( \
+      res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha,                 \
+      M ? remaining_rows : accCols);                                                                                 \
   if (M) return;
 
-#define MICRO_MMA_ROWS(n) \
-  while(row + n*accCols <= rows) { \
-    MICRO_MMA_UNROLL_ITER2(n, 0); \
+#define MICRO_MMA_ROWS(n)             \
+  while (row + n * accCols <= rows) { \
+    MICRO_MMA_UNROLL_ITER2(n, 0);     \
   }
 
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemmMMA_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlpha,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                      Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+                                      Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
   const DataMapper res30 = res.getSubMapper(0, col);
-  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
-  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
-  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
 
-  const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
-  const Scalar* lhs_base = blockA + accCols*offsetA;
+  const Scalar* rhs_base = blockB + col * strideB + accRows * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
   Index row = 0;
 
 #define MAX_MMA_UNROLL 7
@@ -455,7 +415,7 @@
   } else {
     MICRO_MMA_ROWS(2);
   }
-  switch( (rows-row)/accCols ) {
+  switch ((rows - row) / accCols) {
 #if MAX_MMA_UNROLL > 7
     case 7:
       if (accItr == 1) {
@@ -508,42 +468,42 @@
   }
 #undef MAX_MMA_UNROLL
 
-  if(remaining_rows > 0)
-  {
+  if (remaining_rows > 0) {
     MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
   }
 }
 
-#define MICRO_MMA_COLS(n) \
-  for(; col + n*accRows <= cols; col += n*accRows) \
-  { \
-    gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
+#define MICRO_MMA_COLS(n)                                                                                          \
+  for (; col + n * accRows <= cols; col += n * accRows) {                                                          \
+    gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(                                     \
+        res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
   }
 
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
-void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const Index remaining_rows = rows % accCols;
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
+             Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
 
-      if( strideA == -1 ) strideA = depth;
-      if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
 
-      const Packet pAlpha = pset1<Packet>(alpha);
-      const Packet pMask  = bmask<Packet>(remaining_rows);
+  const Packet pAlpha = pset1<Packet>(alpha);
+  const Packet pMask = bmask<Packet>(remaining_rows);
 
-      typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
-      Index col = 0;
+  Index col = 0;
 #ifdef GEMM_MULTIPLE_COLS
-      MICRO_MMA_COLS(4);
-      MICRO_MMA_COLS(2);
+  MICRO_MMA_COLS(4);
+  MICRO_MMA_COLS(2);
 #endif
-      MICRO_MMA_COLS(1);
+  MICRO_MMA_COLS(1);
 
-      if (col != cols)
-      {
-        gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
-      }
+  if (col != cols) {
+    gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+                                                         col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
 }
 
 #define advanceRows ((LhsIsReal) ? 1 : 2)
@@ -556,133 +516,137 @@
 #define PEEL_COMPLEX_MMA 3
 #endif
 
-#define MICRO_COMPLEX_MMA_UNROLL(func) \
-  func(0) func(1) func(2) func(3)
+#define MICRO_COMPLEX_MMA_UNROLL(func) func(0) func(1) func(2) func(3)
 
-#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
-  if (accItr == 1) { \
-    func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
-  } else if (accItr == 2) { \
-    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
-  } else { \
-    func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+#define MICRO_COMPLEX_MMA_WORK(func, type, peel)                                                            \
+  if (accItr == 1) {                                                                                        \
+    func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
+  } else if (accItr == 2) {                                                                                 \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
+  } else {                                                                                                  \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
   }
 
-#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
-  if (unroll_factor > left) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right)                                        \
+  if (unroll_factor > left) {                                                                            \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(                            \
+        &accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
   }
 
 #ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
-  if (unroll_factor > left) { \
-    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], rhsV##right[peel], rhsVi##right[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right)                                    \
+  if (unroll_factor > left) {                                                                        \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(                        \
+        &accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], \
+        rhsV##right[peel], rhsVi##right[peel]);                                                      \
   }
 
-#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
-  if (!LhsIsReal && (unroll_factor > left)) { \
-    if (MICRO_NORMAL(left)) { \
-      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
+#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)                                                  \
+  if (!LhsIsReal && (unroll_factor > left)) {                                                       \
+    if (MICRO_NORMAL(left)) {                                                                       \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left);  \
       __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
-    } else { \
-      lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
-      lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
-      EIGEN_UNUSED_VARIABLE(plhsVi##left); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
-    EIGEN_UNUSED_VARIABLE(plhsVi##left); \
-  } \
+    } else {                                                                                        \
+      lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2);                  \
+      lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2);       \
+      EIGEN_UNUSED_VARIABLE(plhsVi##left);                                                          \
+    }                                                                                               \
+  } else {                                                                                          \
+    EIGEN_UNUSED_VARIABLE(lhsVi2##left);                                                            \
+    EIGEN_UNUSED_VARIABLE(plhsVi##left);                                                            \
+  }                                                                                                 \
   MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
 
 #define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
 #endif
 
-#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
-  ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
-  if (!RhsIsReal) { \
+#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right)                             \
+  ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]);    \
+  if (!RhsIsReal) {                                                          \
     ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
   }
 
-#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
-  MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
+#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
 
 #define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
-  if (PEEL_COMPLEX_MMA > peel) { \
-    Packet lhsV0, lhsV1, lhsV2, lhsV3; \
-    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
-    MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
-    MICRO_COMPLEX_MMA_UNROLL(funcl) \
-    MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
+  if (PEEL_COMPLEX_MMA > peel) {                              \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3;                        \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3;                    \
+    MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel)                      \
+    MICRO_COMPLEX_MMA_UNROLL(funcl)                           \
+    MICRO_COMPLEX_MMA_WORK(funcw, type, peel)                 \
   }
 
 #ifndef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
-  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type)                                                      \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+      rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1];                      \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)                                                                \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 1)                                                                \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 3)
 #else
-#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
-  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
-  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
-  if(!RhsIsReal) { \
+#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right)                                                      \
+  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1);    \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);            \
+  if (!RhsIsReal) {                                                                                       \
     ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
-    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1);        \
+  } else {                                                                                                \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                                                 \
   }
 
 #define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
-  if (PEEL_COMPLEX_MMA > peel2) { \
-    PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
-    PacketBlock<Packet,2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
-    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
-    __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
-    if (sizeof(type) == 16) { \
-      MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
-    } else { \
-      EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
-      EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
-      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
-      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
-    } \
-    MICRO_COMPLEX_MMA_UNROLL(funcl2) \
-    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
-    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
-    EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
-    MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
+  if (PEEL_COMPLEX_MMA > peel2) {                                                        \
+    PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23;                               \
+    PacketBlock<Packet, 2> lhsVi20, lhsVi21, lhsVi22, lhsVi23;                           \
+    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3;                                        \
+    __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3;                                    \
+    if (sizeof(type) == 16) {                                                            \
+      MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1)                       \
+    } else {                                                                             \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                               \
+      EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                              \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1);                                             \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2);                                             \
+    }                                                                                    \
+    MICRO_COMPLEX_MMA_UNROLL(funcl2)                                                     \
+    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1)                                          \
+    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2)                                          \
+  } else {                                                                               \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                 \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                                \
+    MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1)                             \
   }
 
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
-  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
-  __vector_pair prhsV0, prhsV2; \
-  __vector_pair prhsVi0, prhsVi2; \
-  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
-  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type)                                   \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+      rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1];                      \
+  __vector_pair prhsV0, prhsV2;                                                                                     \
+  __vector_pair prhsVi0, prhsVi2;                                                                                   \
+  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1)                                          \
+  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)
 #endif
 
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type)                              \
   type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
-  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)
 
 #define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
-  rhs_ptr_real##right += (accRows * size); \
-  if(!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
+  rhs_ptr_real##right += (accRows * size);         \
+  if (!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
 
-#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) \
-  MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
+#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
 
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size)                     \
   MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
   MICRO_COMPLEX_MMA_UPDATE_RHS(size);
 
 #ifndef VECTOR_PAIR_LOADS_LHS
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
 #else
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
-  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size)                                     \
+  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, \
+                         MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket)                                          \
   MICRO_COMPLEX_MMA_UPDATE_RHS(size);
 
 #define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
@@ -691,12 +655,12 @@
 #define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
 
 #define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
-  if (unroll_factor * accItr > iter) { \
-    bsetzeroMMA(&accReal##iter); \
-    bsetzeroMMA(&accImag##iter); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(accReal##iter); \
-    EIGEN_UNUSED_VARIABLE(accImag##iter); \
+  if (unroll_factor * accItr > iter) {      \
+    bsetzeroMMA(&accReal##iter);            \
+    bsetzeroMMA(&accImag##iter);            \
+  } else {                                  \
+    EIGEN_UNUSED_VARIABLE(accReal##iter);   \
+    EIGEN_UNUSED_VARIABLE(accImag##iter);   \
   }
 
 #define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
@@ -705,61 +669,56 @@
 
 #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
 
-#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
-  if (unroll_factor > left) { \
-    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>(row + left*accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right)                                                                 \
+  if (unroll_factor > left) {                                                                                          \
+    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>( \
+        row + left * accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter);              \
   }
 
-#define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
-  if (accItr == 1) { \
-    func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
-  } else if (accItr == 2) { \
-    func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
-  } else { \
-    func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+#define MICRO_COMPLEX_MMA_ITER_UNROLL(func)                 \
+  if (accItr == 1) {                                        \
+    func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) \
+  } else if (accItr == 2) {                                 \
+    func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) \
+  } else {                                                  \
+    func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) \
   }
 
 #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
 
-#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
-  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3##right, blockA, rhs_base + right*accRows*(RhsIsReal ? 1 : 2)*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right)                                                                            \
+  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+                         RhsIsReal>(res3##right, blockA, rhs_base + right * accRows * (RhsIsReal ? 1 : 2) * strideB,   \
+                                    depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal,           \
+                                    pAlphaImag, pMask);
 
-#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) \
-  MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
 
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
-  const DataMapper& res0,
-  const DataMapper& res1,
-  const DataMapper& res2,
-  const DataMapper& res3,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index& row,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
-  const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL, * rhs_ptr_real3 = NULL;
-  const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL, * rhs_ptr_imag3 = NULL;
-  const Index imag_delta = accCols*strideA;
-  const Index imag_delta2 = accCols2*strideA;
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket,
+          typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+                                                             const DataMapper& res2, const DataMapper& res3,
+                                                             const Scalar* lhs_base, const Scalar* rhs_base,
+                                                             Index depth, Index strideA, Index offsetA, Index strideB,
+                                                             Index& row, const Packet& pAlphaReal,
+                                                             const Packet& pAlphaImag, const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL, *rhs_ptr_real3 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL, *rhs_ptr_imag3 = NULL;
+  const Index imag_delta = accCols * strideA;
+  const Index imag_delta2 = accCols2 * strideA;
 
-  if(!RhsIsReal) {
-    rhs_ptr_imag0 = rhs_base + accRows*strideB;
+  if (!RhsIsReal) {
+    rhs_ptr_imag0 = rhs_base + accRows * strideB;
   } else {
     EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
   }
   if (accItr > 1) {
-    if(!RhsIsReal) {
-      rhs_ptr_real1 = rhs_base + (2*accRows*strideB);
-      rhs_ptr_imag1 = rhs_base + (3*accRows*strideB);
+    if (!RhsIsReal) {
+      rhs_ptr_real1 = rhs_base + (2 * accRows * strideB);
+      rhs_ptr_imag1 = rhs_base + (3 * accRows * strideB);
     } else {
-      rhs_ptr_real1 = rhs_base + accRows*strideB;
+      rhs_ptr_real1 = rhs_base + accRows * strideB;
       EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
     }
   } else {
@@ -768,14 +727,14 @@
     EIGEN_UNUSED_VARIABLE(res1);
   }
   if (accItr > 2) {
-    if(!RhsIsReal) {
-      rhs_ptr_real2 = rhs_base + (4*accRows*strideB);
-      rhs_ptr_imag2 = rhs_base + (5*accRows*strideB);
-      rhs_ptr_real3 = rhs_base + (6*accRows*strideB);
-      rhs_ptr_imag3 = rhs_base + (7*accRows*strideB);
+    if (!RhsIsReal) {
+      rhs_ptr_real2 = rhs_base + (4 * accRows * strideB);
+      rhs_ptr_imag2 = rhs_base + (5 * accRows * strideB);
+      rhs_ptr_real3 = rhs_base + (6 * accRows * strideB);
+      rhs_ptr_imag3 = rhs_base + (7 * accRows * strideB);
     } else {
-      rhs_ptr_real2 = rhs_base + (2*accRows*strideB);
-      rhs_ptr_real3 = rhs_base + (3*accRows*strideB);
+      rhs_ptr_real2 = rhs_base + (2 * accRows * strideB);
+      rhs_ptr_real3 = rhs_base + (3 * accRows * strideB);
       EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
       EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
     }
@@ -787,25 +746,23 @@
     EIGEN_UNUSED_VARIABLE(res2);
     EIGEN_UNUSED_VARIABLE(res3);
   }
-  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
-  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
+  const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+  const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
   __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
 
   MICRO_COMPLEX_MMA_SRC_PTR
   MICRO_COMPLEX_MMA_DST_PTR
 
   Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
-  for(; k <= depth2; k += PEEL_COMPLEX_MMA)
-  {
+  for (; k <= depth2; k += PEEL_COMPLEX_MMA) {
     EIGEN_POWER_PREFETCH(rhs_ptr_real);
-    if(!RhsIsReal) {
+    if (!RhsIsReal) {
       EIGEN_POWER_PREFETCH(rhs_ptr_imag);
     }
     MICRO_COMPLEX_MMA_PREFETCH
     MICRO_COMPLEX_MMA_ONE_PEEL
   }
-  for(; k < depth; k++)
-  {
+  for (; k < depth; k++) {
     MICRO_COMPLEX_MMA_ONE
   }
   MICRO_COMPLEX_MMA_STORE
@@ -813,39 +770,32 @@
   MICRO_COMPLEX_UPDATE
 }
 
-#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
-  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M)                                                                           \
+  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows,        \
+                                      accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal,      \
+                                      accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, \
+                                              strideB, row, pAlphaReal, pAlphaImag, pMask);                            \
   if (M) return;
 
-#define MICRO_COMPLEX_MMA_ROWS(n) \
-  while(row + n*accCols <= rows) { \
+#define MICRO_COMPLEX_MMA_ROWS(n)         \
+  while (row + n * accCols <= rows) {     \
     MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
   }
 
-template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
-  const DataMapper& res,
-  const Scalar* blockA,
-  const Scalar* blockB,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index offsetB,
-  Index col,
-  Index rows,
-  Index remaining_rows,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag,
-  const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper,
+          const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+          bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                              Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+                                              Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                              const Packet& pAlphaImag, const Packet& pMask) {
   const DataMapper res30 = res.getSubMapper(0, col);
-  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
-  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
-  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
 
-  const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
-  const Scalar* lhs_base = blockA + accCols*offsetA;
+  const Scalar* rhs_base = blockB + advanceCols * col * strideB + accRows * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
   Index row = 0;
 
 #define MAX_COMPLEX_MMA_UNROLL 4
@@ -863,7 +813,7 @@
   } else {
     MICRO_COMPLEX_MMA_ROWS(1);
   }
-  switch( (rows-row)/accCols ) {
+  switch ((rows - row) / accCols) {
 #if MAX_COMPLEX_MMA_UNROLL > 3
     case 3:
       if (accItr == 1) {
@@ -890,59 +840,62 @@
   }
 #undef MAX_COMPLEX_MMA_UNROLL
 
-  if(remaining_rows > 0)
-  {
+  if (remaining_rows > 0) {
     MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)
   }
 }
 
-#define MICRO_COMPLEX_MMA_COLS(n) \
-  for(; col + n*accRows <= cols; col += n*accRows) \
-  { \
-    gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_MMA_COLS(n)                                                                                      \
+  for (; col + n * accRows <= cols; col += n * accRows) {                                                              \
+    gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs,              \
+                         ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, \
+                                                                offsetB, col, rows, remaining_rows, pAlphaReal,        \
+                                                                pAlphaImag, pMask);                                    \
   }
 
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
-      const Index remaining_rows = rows % accCols;
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+          typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth,
+                     Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
 
-      if( strideA == -1 ) strideA = depth;
-      if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
 
-      const Packet pAlphaReal = pset1<Packet>(alpha.real());
-      const Packet pAlphaImag = pset1<Packet>(alpha.imag());
-      const Packet pMask = bmask<Packet>(remaining_rows);
+  const Packet pAlphaReal = pset1<Packet>(alpha.real());
+  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+  const Packet pMask = bmask<Packet>(remaining_rows);
 
-      const Scalar* blockA = (Scalar *) blockAc;
-      const Scalar* blockB = (Scalar *) blockBc;
+  const Scalar* blockA = (Scalar*)blockAc;
+  const Scalar* blockB = (Scalar*)blockBc;
 
-      typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
 
-      Index col = 0;
+  Index col = 0;
 #ifdef GEMM_MULTIPLE_COLS
-      MICRO_COMPLEX_MMA_COLS(4);
-      MICRO_COMPLEX_MMA_COLS(2);
+  MICRO_COMPLEX_MMA_COLS(4);
+  MICRO_COMPLEX_MMA_COLS(2);
 #endif
-      MICRO_COMPLEX_MMA_COLS(1);
+  MICRO_COMPLEX_MMA_COLS(1);
 
-      if (col != cols)
-      {
-        gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
-      }
+  if (col != cols) {
+    gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                            RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+                                       remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
 }
 
 #undef accColsC
 #undef advanceRows
 #undef advanceCols
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
 #pragma GCC pop_options
 #endif
 
-#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-
+#endif  // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
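
The hunks above are whitespace-only; the output is consistent with a Google-based .clang-format at a 120-column limit (attached braces, two-space indent, left pointer alignment, indented case labels, aligned escaped newlines). That is an inference from the formatted output, not a quoted configuration file. For readers unfamiliar with the POWER10 MMA builtins these macros drive, here is a minimal standalone C++ sketch (not part of the patch) of the accumulator lifecycle behind bsetzeroMMA and storeComplexAccumulator. The plain 4x4 float tile and single accumulator are illustrative assumptions; the real kernels handle complex data and unroll up to four accumulators per iteration.

// Standalone sketch under the assumptions above; build with e.g. gcc -mcpu=power10.
#include <altivec.h>

static inline void mma_f32_tile(const float* a, const float* b, long depth, float out[4][4]) {
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);  // zero the 512-bit accumulator (the bsetzeroMMA step)
  for (long k = 0; k < depth; k++) {
    __vector float va = vec_xl(0, a + k * 4);  // four lhs values for this depth step
    __vector float vb = vec_xl(0, b + k * 4);  // four rhs values for this depth step
    // Rank-1 update acc += va * vb^T, the analogue of one MICRO_..._WORK_ONE step.
    __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char)va, (__vector unsigned char)vb);
  }
  // Spill the accumulator to four vector registers before alpha scaling and the store,
  // which is also the first thing storeComplexAccumulator has to do.
  __builtin_mma_disassemble_acc(out, &acc);
}
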
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index 5094118..6ecec0e 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -11,11 +11,10 @@
 
 namespace internal {
 
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA)
-{
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA) {
   Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
-  if(zero){
+  if (zero) {
     Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
     return vec_mergeh(lhs1.m_val, lhs2.m_val);
   } else {
@@ -23,239 +22,243 @@
   }
 }
 
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i)
-{
-  return loadBfloat16<zero>(blockB + strideB*i);
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i) {
+  return loadBfloat16<zero>(blockB + strideB * i);
 }
 
-template<Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
-EIGEN_ALWAYS_INLINE void KLoop
-(
-  const bfloat16* indexA,
-  const bfloat16* indexB,
-  __vector_quad (&quad_acc)[num_acc],
-  Index strideB,
-  Index k,
-  Index offsetB,
-  Index extra_cols,
-  Index extra_rows
-)
-{
+template <Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs,
+          Index num_lhs>
+EIGEN_ALWAYS_INLINE void KLoop(const bfloat16* indexA, const bfloat16* indexB, __vector_quad (&quad_acc)[num_acc],
+                               Index strideB, Index k, Index offsetB, Index extra_cols, Index extra_rows) {
   Packet8bf lhs[num_lhs], rhs[num_rhs];
 
   BFLOAT16_UNROLL
-  for(Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++){
-    rhs[i] = loadRhsBfloat16<zero>(indexB + k*4, strideB, i);
+  for (Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++) {
+    rhs[i] = loadRhsBfloat16<zero>(indexB + k * 4, strideB, i);
   }
-  if(rhsExtraCols) {
-    rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k*extra_cols - offsetB, strideB, num_rhs - 1);
+  if (rhsExtraCols) {
+    rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k * extra_cols - offsetB, strideB, num_rhs - 1);
   }
 
-  indexA += k*(lhsExtraRows ? extra_rows : num_packets);
+  indexA += k * (lhsExtraRows ? extra_rows : num_packets);
   if (num_lhs == 1) {
     lhs[0] = loadBfloat16<zero>(indexA);
   } else {
     BFLOAT16_UNROLL
-    for(Index j = 0; j < num_lhs; j += 2) {
-      Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0)*(zero ? 4 : 8));
+    for (Index j = 0; j < num_lhs; j += 2) {
+      Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0) * (zero ? 4 : 8));
       if (zero) {
         Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
         lhs[j + 0] = vec_mergeh(lhs1.m_val, lhs2.m_val);
         lhs[j + 1] = vec_mergel(lhs1.m_val, lhs2.m_val);
       } else {
         lhs[j + 0] = lhs1;
-        lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1)*8);
+        lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1) * 8);
       }
     }
   }
 
   BFLOAT16_UNROLL
-  for(Index i = 0, x = 0; i < num_rhs; i++) {
+  for (Index i = 0, x = 0; i < num_rhs; i++) {
     BFLOAT16_UNROLL
-    for(Index j = 0; j < num_lhs; j++, x++) {
-      __builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val), reinterpret_cast<Packet16uc>(lhs[j].m_val));
+    for (Index j = 0; j < num_lhs; j++, x++) {
+      __builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val),
+                                 reinterpret_cast<Packet16uc>(lhs[j].m_val));
     }
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc]) {
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k++)
-    __builtin_mma_xxsetaccz(&(quad_acc[k]));
+  for (Index k = 0; k < num_acc; k++) __builtin_mma_xxsetaccz(&(quad_acc[k]));
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4]) {
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k++)
-    __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
+  for (Index k = 0; k < num_acc; k++) __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
 }
 
-template<Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
-EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result, const Index extra_cols, Index extra_rows)
-{
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
+EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+                                       const Index extra_cols, Index extra_rows) {
   BFLOAT16_UNROLL
-  for(Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4*rows){
+  for (Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4 * rows) {
     BFLOAT16_UNROLL
-    for(Index j = 0; j < num_lhs; j++, k++) {
-      storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j*4, extra_cols, extra_rows);
+    for (Index j = 0; j < num_lhs; j++, k++) {
+      storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j * 4, extra_cols, extra_rows);
     }
   }
-  if(rhsExtraCols) {
+  if (rhsExtraCols) {
     storeResults<rhsExtraCols, lhsExtraRows>(acc[num_acc - 1], rows, pAlpha, result, extra_cols, extra_rows);
   }
 }
 
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
-EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* indexB, Index strideB, Index offsetB, float* result, const Index extra_cols, const Index extra_rows)
-{
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
+EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                         const bfloat16* indexB, Index strideB, Index offsetB, float* result,
+                                         const Index extra_cols, const Index extra_rows) {
   constexpr Index num_lhs = multiIter ? (num_packets / 4) : 1;
   constexpr Index num_rhs = (num_acc + num_lhs - 1) / num_lhs;
 
-  for(Index offset_row = 0; offset_row < num_packets; offset_row += 4, indexA += (multiIter ? 0 : 8), indexB += (multiIter ? (num_rhs*strideB) : 0), result += (multiIter ? (4*rows*num_rhs) : 4)) {
+  for (Index offset_row = 0; offset_row < num_packets; offset_row += 4, indexA += (multiIter ? 0 : 8),
+             indexB += (multiIter ? (num_rhs * strideB) : 0), result += (multiIter ? (4 * rows * num_rhs) : 4)) {
     Packet4f acc[num_acc][4];
     __vector_quad quad_acc[num_acc];
 
     zeroAccumulators<num_acc>(quad_acc);
 
     Index k;
-    for(k = 0; k + 2 <= depth; k += 2){
-      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+    for (k = 0; k + 2 <= depth; k += 2) {
+      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+          indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
     }
-    if(depth&1){
-      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(indexA - (multiIter ? 0 : offset_row), indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+    if (depth & 1) {
+      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+          indexA - (multiIter ? 0 : offset_row), indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
     }
 
     disassembleAccumulators<num_acc>(quad_acc, acc);
 
-    outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols, extra_rows);
+    outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols,
+                                                                         extra_rows);
   }
 }
 
-#define MAX_BFLOAT16_ACC   8
+#define MAX_BFLOAT16_ACC 8
 
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* indexB, Index strideB, Index offsetB, float* result)
-{
-  constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                 const bfloat16* indexB, Index strideB, Index offsetB, float* result) {
+  constexpr Index step = (num_acc * 4);  // each accumulator has 4 elements
   const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
   const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
   constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC);
   constexpr bool normIters = multiIters && ((num_acc % (num_packets / 4)) == 0);
 
-  do{
-    colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
+  do {
+    colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(
+        depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
 
-    indexB += strideB*num_acc;
-    result += rows*step;
-  } while(multiIters && (step <= cols - (col += step)));
+    indexB += strideB * num_acc;
+    result += rows * step;
+  } while (multiIters && (step <= cols - (col += step)));
 }
 
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+                                           const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB,
+                                           float* result) {
   if (MAX_BFLOAT16_ACC > num_acc) {
-    colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+    colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols, lhsExtraRows>(
+        col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
   }
 }
 
-template<const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                      const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
   switch ((cols - col) >> 2) {
-  case 7:
-    colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 6:
-    colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 5:
-    colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 4:
-    colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 3:
-    colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 2:
-    colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  case 1:
-    colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    break;
-  default:
-    if (rhsExtraCols) {
-      colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
-    }
-    break;
+    case 7:
+      colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 6:
+      colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 5:
+      colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 4:
+      colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 3:
+      colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 2:
+      colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 1:
+      colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    default:
+      if (rhsExtraCols) {
+        colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                        offsetB, result);
+      }
+      break;
   }
 }
 
-template<const Index num_packets, bool lhsExtraRows = false>
-EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_packets, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                  const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
   Index col = 0;
   if (cols >= (MAX_BFLOAT16_ACC * 4)) {
-    colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result);
-    blockB += (strideB >> 2)*col;
-    result += rows*col;
+    colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, 0, result);
+    blockB += (strideB >> 2) * col;
+    result += rows * col;
   }
   if (cols & 3) {
-    colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+    colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+                                                      result);
   } else {
-    colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result);
+    colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0,
+                                                       result);
   }
 }
 
-EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float *res)
-{
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float* res) {
   Packet16uc fp16[2];
-  __vector_pair fp16_vp = *reinterpret_cast<__vector_pair *>(const_cast<float *>(res));
+  __vector_pair fp16_vp = *reinterpret_cast<__vector_pair*>(const_cast<float*>(res));
   __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(fp16), &fp16_vp);
   fp16[0] = __builtin_vsx_xvcvspbf16(fp16[0]);
   fp16[1] = __builtin_vsx_xvcvspbf16(fp16[1]);
   return vec_pack(reinterpret_cast<Packet4ui>(fp16[0]), reinterpret_cast<Packet4ui>(fp16[1]));
 }
 
-template<typename DataMapper, const Index size>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float *result, Index col, Index rows, const DataMapper& res)
-{
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float* result, Index col, Index rows, const DataMapper& res) {
   const DataMapper res2 = res.getSubMapper(0, col);
   Index row;
-  float *result2 = result + col*rows;
-  for(row = 0; row + 8 <= rows; row += 8, result2 += 8){
+  float* result2 = result + col * rows;
+  for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
     // get and save block
-    PacketBlock<Packet8bf,size> block;
+    PacketBlock<Packet8bf, size> block;
     BFLOAT16_UNROLL
-    for(Index j = 0; j < size; j++){
-      block.packet[j] = convertF32toBF16(result2 + j*rows);
+    for (Index j = 0; j < size; j++) {
+      block.packet[j] = convertF32toBF16(result2 + j * rows);
     }
-    res2.template storePacketBlock<Packet8bf,size>(row, 0, block);
+    res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
   }
   // extra rows
-  if(row < rows){
+  if (row < rows) {
     BFLOAT16_UNROLL
-    for(Index j = 0; j < size; j++){
-      Packet8bf fp16 = convertF32toBF16(result2 + j*rows);
+    for (Index j = 0; j < size; j++) {
+      Packet8bf fp16 = convertF32toBF16(result2 + j * rows);
       res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
     }
   }
 }
 
-template<const Index size, bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc = 1)
-{
+template <const Index size, bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst,
+                                                 Index resInc = 1) {
   constexpr Index extra = ((size < 8) ? 8 : size);
-  while (i + size <= rows){
-    PacketBlock<Packet8bf,(size+7)/8> r32;
-    r32.packet[0] = convertF32toBF16(result + i +  0);
+  while (i + size <= rows) {
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = convertF32toBF16(result + i + 0);
     if (size >= 16) {
-      r32.packet[1] = convertF32toBF16(result + i +  8);
+      r32.packet[1] = convertF32toBF16(result + i + 8);
     }
     if (size >= 32) {
       r32.packet[2] = convertF32toBF16(result + i + 16);
@@ -269,64 +272,64 @@
       storeBF16fromResult<size, non_unit_stride, 16>(dst, r32.packet[2], resInc);
       storeBF16fromResult<size, non_unit_stride, 24>(dst, r32.packet[3], resInc);
     }
-    i += extra; dst += extra*resInc;
+    i += extra;
+    dst += extra * resInc;
     if (size != 32) break;
   }
 }
 
-template<bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float *result, Index rows, bfloat16* dst, Index resInc = 1)
-{
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
   Index i = 0;
-  convertPointerF32toBF16<32,non_unit_stride>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16<16,non_unit_stride>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16<8,non_unit_stride>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16<1,non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<32, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<16, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<8, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<1, non_unit_stride>(i, result, rows, dst, resInc);
 }
 
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float *result, Index cols, Index rows, const DataMapper& res)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float* result, Index cols, Index rows, const DataMapper& res) {
   Index col;
-  for(col = 0; col + 4 <= cols; col += 4){
-    convertArrayF32toBF16Col<DataMapper,4>(result, col, rows, res);
+  for (col = 0; col + 4 <= cols; col += 4) {
+    convertArrayF32toBF16Col<DataMapper, 4>(result, col, rows, res);
   }
   // extra cols
   switch (cols - col) {
-  case 1:
-    convertArrayF32toBF16Col<DataMapper,1>(result, col, rows, res);
-    break;
-  case 2:
-    convertArrayF32toBF16Col<DataMapper,2>(result, col, rows, res);
-    break;
-  case 3:
-    convertArrayF32toBF16Col<DataMapper,3>(result, col, rows, res);
-    break;
+    case 1:
+      convertArrayF32toBF16Col<DataMapper, 1>(result, col, rows, res);
+      break;
+    case 2:
+      convertArrayF32toBF16Col<DataMapper, 2>(result, col, rows, res);
+      break;
+    case 3:
+      convertArrayF32toBF16Col<DataMapper, 3>(result, col, rows, res);
+      break;
   }
 }
 
-template<Index size>
-EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA, Index offsetB, Index bigSuffix, float *result)
-{
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows,
+                                      const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA,
+                                      Index offsetB, Index bigSuffix, float* result) {
   if ((size == 16) || (rows & size)) {
-    indexA += size*offsetA;
+    indexA += size * offsetA;
     colLoops<size>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
     row += size;
-    indexA += bigSuffix*size/16;
+    indexA += bigSuffix * size / 16;
   }
 }
 
-template<typename DataMapper>
-void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename DataMapper>
+void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+                     Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
   float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
   const Packet4f pAlpha = pset1<Packet4f>(falpha);
-  ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
+  ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
 
   convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
 
-  if( strideA == -1 ) strideA = depth;
-  if( strideB == -1 ) strideB = depth;
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
   // Packing is done in blocks.
   // There's 4 possible sizes of blocks
   // Blocks of 8 columns with 16 elements (8x16)
@@ -335,13 +338,13 @@
   // Blocks of 8 columns with < 4 elements. This happens when there's less than 4 remaining rows
 
   // Loop for LHS standard block (8x16)
-  Index bigSuffix = (2*8) * (strideA-offsetA);
-  indexB += 4*offsetB;
+  Index bigSuffix = (2 * 8) * (strideA - offsetA);
+  indexB += 4 * offsetB;
   strideB *= 4;
   offsetB *= 3;
 
   Index row = 0;
-  while(row + 16 <= rows){
+  while (row + 16 <= rows) {
     calcColLoops<16>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
   }
   // LHS (8x8) block
@@ -349,7 +352,7 @@
   // LHS (8x4) block
   calcColLoops<4>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
   // extra rows
-  if(rows & 3){
+  if (rows & 3) {
     // This index is the beginning of remaining block.
     colLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
   }
@@ -361,12 +364,11 @@
 #undef MAX_BFLOAT16_ACC
 
 #if !EIGEN_ALTIVEC_DISABLE_MMA
-template<Index num_acc, typename LhsMapper, bool zero>
-EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1)
-{
-  a0[k + 0] = lhs.template loadPacket<Packet8bf>(k*4, 0);
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1) {
+  a0[k + 0] = lhs.template loadPacket<Packet8bf>(k * 4, 0);
   if (!zero) {
-    b1 = lhs.template loadPacket<Packet8bf>(k*4, 1);
+    b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
   }
   if (num_acc > (k + 1)) {
     a0[k + 1] = vec_mergel(a0[k + 0].m_val, b1.m_val);
@@ -374,18 +376,17 @@
   a0[k + 0] = vec_mergeh(a0[k + 0].m_val, b1.m_val);
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0) {
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k++) {
-    __builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val), reinterpret_cast<Packet16uc>(a0[k].m_val));
+  for (Index k = 0; k < num_acc; k++) {
+    __builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val),
+                               reinterpret_cast<Packet16uc>(a0[k].m_val));
   }
 }
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
-EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc])
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc]) {
   Packet8bf a0[num_acc];
   Packet8bf b1 = pset1<Packet8bf>(Eigen::bfloat16(0));
   Packet8bf b0 = loadColData<RhsMapper, linear>(rhs, j);
@@ -398,23 +399,23 @@
 
   LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k += 2) {
+  for (Index k = 0; k < num_acc; k += 2) {
     loadVecLoop<num_acc, LhsSubMapper, zero>(k, lhs2, a0, b1);
   }
 
   multVec<num_acc>(quad_acc, a0, b0);
 }
 
-#define MAX_BFLOAT16_VEC_ACC   8
+#define MAX_BFLOAT16_VEC_ACC 8
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                       float* result) {
   constexpr Index step = (num_acc * 4);
   const Index extra_rows = (extraRows) ? (rows & 3) : 0;
   constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC);
 
-  do{
+  do {
     Packet4f acc[num_acc][4];
     __vector_quad quad_acc[num_acc];
 
@@ -423,7 +424,7 @@
     using LhsSubMapper = typename LhsMapper::SubMapper;
 
     LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
-    for(Index j = 0; j + 2 <= cend; j += 2) {
+    for (Index j = 0; j + 2 <= cend; j += 2) {
       vecColLoop<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, quad_acc);
     }
     if (cend & 1) {
@@ -435,56 +436,58 @@
     outputVecColResults<num_acc, extraRows>(acc, result, pAlpha, extra_rows);
 
     result += step;
-  } while(multiIters && (step <= rows - (row += step)));
+  } while (multiIters && (step <= rows - (row += step)));
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                 const Packet4f pAlpha, float* result) {
   if (MAX_BFLOAT16_VEC_ACC > num_acc) {
-    colVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+    colVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs,
+                                                                                              pAlpha, result);
   }
 }
 
-template<typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                const Packet4f pAlpha, float* result) {
   switch ((rows - row) >> 2) {
-  case 7:
-    colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 6:
-    colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 5:
-    colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 4:
-    colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 3:
-    colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 2:
-    colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 1:
-    colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  default:
-    if (extraRows) {
-      colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    }
-    break;
+    case 7:
+      colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    default:
+      if (extraRows) {
+        colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      }
+      break;
   }
 }
 
-template<typename LhsMapper, typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                         float* result) {
   Index row = 0;
   if (rows >= (MAX_BFLOAT16_VEC_ACC * 4)) {
-    colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+    colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha,
+                                                                                 result);
     result += row;
   }
   if (rows & 3) {
@@ -494,10 +497,10 @@
   }
 }
 
-template<typename RhsMapper, typename LhsMapper, typename = void>
+template <typename RhsMapper, typename LhsMapper, typename = void>
 struct UseMMAStride : std::false_type {
-  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
-  {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
     using RhsSubMapper = typename RhsMapper::SubMapper;
 
     RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -505,11 +508,12 @@
   }
 };
 
-template<typename RhsMapper, typename LhsMapper>
-struct UseMMAStride<RhsMapper, LhsMapper, std::enable_if_t<std::is_member_function_pointer<
-                           decltype(&RhsMapper::stride)>::value>> : std::true_type {
-  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
-  {
+template <typename RhsMapper, typename LhsMapper>
+struct UseMMAStride<RhsMapper, LhsMapper,
+                    std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+    : std::true_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
     using RhsSubMapper = typename RhsMapper::SubMapper;
 
     RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -521,14 +525,9 @@
   }
 };
 
-template<typename LhsMapper, typename RhsMapper>
-void gemvMMA_bfloat16_col(
-  Index rows, Index cols,
-  const LhsMapper& alhs,
-  const RhsMapper& rhs,
-  bfloat16* res, Index resIncr,
-  bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+void gemvMMA_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+                          Index resIncr, bfloat16 alpha) {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr == 1);
 
@@ -548,8 +547,7 @@
 
   convertArrayPointerBF16toF32(result, 1, rows, res);
 
-  for (Index j2 = 0; j2 < cols; j2 += block_cols)
-  {
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
     Index jend = numext::mini(j2 + block_cols, cols);
 
     using LhsSubMapper = typename LhsMapper::SubMapper;
@@ -561,11 +559,11 @@
   convertArrayPointerF32toBF16(result, rows, res);
 }
 
-static Packet16uc p16uc_ELEMENT_VEC3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
+static Packet16uc p16uc_ELEMENT_VEC3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+                                        0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k) {
   if (num_acc > (k + 1)) {
     acc[k][0] = vec_mergeh(acc[k][0], acc[k + 1][0]);
     acc[k][1] = vec_mergeo(acc[k][1], acc[k + 1][1]);
@@ -584,22 +582,22 @@
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4]) {
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k += 4) {
+  for (Index k = 0; k < num_acc; k += 4) {
     preduxVecResults2<num_acc>(acc, k + 0);
     if (num_acc > (k + 2)) {
       preduxVecResults2<num_acc>(acc, k + 2);
-      acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
     }
   }
 }
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
-EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j, Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+                                     Index extra_cols) {
   Packet8bf a0[num_acc], b0;
 
   if (extra) {
@@ -610,7 +608,7 @@
 
   const LhsMapper lhs2 = lhs.getSubMapper(0, j);
   BFLOAT16_UNROLL
-  for(Index k = 0; k < num_acc; k++) {
+  for (Index k = 0; k < num_acc; k++) {
     if (extra) {
       a0[k] = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
     } else {
@@ -621,11 +619,11 @@
   multVec<num_acc>(quad_acc, a0, b0);
 }
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc], Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc],
+                                 Index extra_cols) {
   Index j = 0;
-  for(; j + 8 <= cols; j += 8){
+  for (; j + 8 <= cols; j += 8) {
     multVecLoop<num_acc, LhsMapper, RhsMapper, false>(quad_acc, lhs, rhs, j, extra_cols);
   }
 
@@ -634,13 +632,13 @@
   }
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                    float* result) {
   constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC);
   const Index extra_cols = (cols & 7);
 
-  do{
+  do {
     Packet4f acc[num_acc][4];
     __vector_quad quad_acc[num_acc];
 
@@ -656,48 +654,48 @@
     outputVecResults<num_acc>(acc, result, pAlpha);
 
     result += num_acc;
-  } while(multiIters && (num_acc <= rows - (row += num_acc)));
+  } while (multiIters && (num_acc <= rows - (row += num_acc)));
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                              const Packet4f pAlpha, float* result) {
   if (MAX_BFLOAT16_VEC_ACC > num_acc) {
     colVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
   }
 }
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                             const Packet4f pAlpha, float* result) {
   switch (rows - row) {
-  case 7:
-    colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 6:
-    colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 5:
-    colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 4:
-    colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 3:
-    colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 2:
-    colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 1:
-    colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
+    case 7:
+      colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
   }
 }
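
This switch is the tail-dispatch idiom used throughout the file: the runtime remainder (rows - row, here 1..7) selects a template instantiation, so every possible tail width runs a body whose unroll factor is a compile-time constant. A minimal self-contained sketch of the pattern, with a hypothetical tailBody standing in for colVecLoopBody:

    #include <cstdio>

    // Hypothetical stand-in for colVecLoopBody<num_acc, ...>: the template
    // parameter fixes the unroll factor at compile time.
    template <int N>
    void tailBody() { std::printf("handling %d leftover rows\n", N); }

    // Runtime remainder -> compile-time instantiation, mirroring the switch
    // in colVecLoopBodyExtra.
    void dispatchTail(int remainder) {
      switch (remainder) {
        case 3: tailBody<3>(); break;
        case 2: tailBody<2>(); break;
        case 1: tailBody<1>(); break;
      }
    }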
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                      float* result) {
   Index row = 0;
   if (rows >= MAX_BFLOAT16_VEC_ACC) {
     colVecLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
@@ -706,14 +704,9 @@
   colVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
 }
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(
-  Index rows, Index cols,
-  const LhsMapper& alhs,
-  const RhsMapper& rhs,
-  bfloat16* res, Index resIncr,
-  bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                              bfloat16* res, Index resIncr, bfloat16 alpha) {
   typedef typename RhsMapper::LinearMapper LinearMapper;
 
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
@@ -744,6 +737,6 @@
 #undef MAX_BFLOAT16_VEC_ACC
 #undef BFLOAT16_UNROLL
 
-}
-}
-#endif //EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 66e1088..90c0d39 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -24,11 +24,12 @@
 #endif
 #endif
 
-//#define USE_SLOWER_GEMV_MMA   // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved)
+// #define USE_SLOWER_GEMV_MMA   // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is
+// improved)
 
-//#define EIGEN_POWER_USE_GEMV_PREFETCH
+// #define EIGEN_POWER_USE_GEMV_PREFETCH
 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-#define EIGEN_POWER_GEMV_PREFETCH(p)  prefetch(p)
+#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
 #else
 #define EIGEN_POWER_GEMV_PREFETCH(p)
 #endif
@@ -61,58 +62,50 @@
 #endif
 
 #define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16))
-#define GEMV_IS_FLOAT           (ResPacketSize == (16 / sizeof(float)))
-#define GEMV_IS_SCALAR          (sizeof(ResPacket) != 16)
-#define GEMV_IS_COMPLEX_FLOAT   (ResPacketSize == (16 / sizeof(std::complex<float>)))
+#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
+#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
+#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
 
 /** \internal multiply and add and store results */
-template<typename ResPacket, typename ResScalar>
-EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data)
-{
-    pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
+template <typename ResPacket, typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) {
+  pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
 }
 
-template<typename ResScalar>
-EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data)
-{
-    *res += (alpha * data);
+template <typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) {
+  *res += (alpha * data);
 }
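
These two storeMaddData overloads let the same call site serve both the vectorized body and the scalar tail: the packet form does an unaligned load, fused multiply-add, and store, while the scalar form is a plain multiply-accumulate. As a plain-loop sketch of what the packet overload computes, assuming a 4-wide float packet:

    // Per-lane effect of the packet overload: res[i] += alpha * data[i].
    void storeMaddScalarEquivalent(float* res, float alpha, const float* data) {
      for (int i = 0; i < 4; ++i) res[i] += alpha * data[i];
    }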
 
-#define GEMV_UNROLL(func, N) \
-  func(0, N) func(1, N) func(2, N) func(3, N) \
-  func(4, N) func(5, N) func(6, N) func(7, N)
+#define GEMV_UNROLL(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
 
-#define GEMV_UNROLL_HALF(func, N) \
-  func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+#define GEMV_UNROLL_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
 
 #define GEMV_GETN(N) (((N) * ResPacketSize) >> 2)
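
For reference, GEMV_UNROLL(func, N) always pastes eight invocations, func(0, N) through func(7, N); each func body (e.g. GEMV_INIT below) guards itself with a constant condition like `if (N > iter)`, so the compiler discards the iterations beyond N and what survives is an N-way unrolled sequence. GEMV_GETN(N) converts a packet count into the number of four-element groups those packets span: N for float (ResPacketSize == 4) and N/2 for double (ResPacketSize == 2).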
 
-#define GEMV_LOADPACKET_COL(iter) \
-  lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
+#define GEMV_LOADPACKET_COL(iter) lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
 
 #ifdef USE_GEMV_MMA
-#define GEMV_UNROLL3(func, N, which) \
-  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
-  func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
+#define GEMV_UNROLL3(func, N, which)                                                                          \
+  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+      func(6, N, which) func(7, N, which)
 
 #define GEMV_UNUSED_VAR(iter, N, which) \
-  if (GEMV_GETN(N) <= iter) { \
+  if (GEMV_GETN(N) <= iter) {           \
     EIGEN_UNUSED_VARIABLE(which##iter); \
   }
 
 #define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \
-  if (N <= iter) { \
-    EIGEN_UNUSED_VARIABLE(which##iter); \
+  if (N <= iter) {                            \
+    EIGEN_UNUSED_VARIABLE(which##iter);       \
   }
 
-#define GEMV_UNUSED_EXTRA(N, which) \
-  GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
+#define GEMV_UNUSED_EXTRA(N, which) GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
 
-#define GEMV_UNUSED(N, which) \
-  GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
+#define GEMV_UNUSED(N, which) GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
 
-#define GEMV_INIT_MMA(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
+#define GEMV_INIT_MMA(iter, N)         \
+  if (GEMV_GETN(N) > iter) {           \
     __builtin_mma_xxsetaccz(&e##iter); \
   }
 
@@ -120,354 +113,336 @@
 #define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
   GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1));
 #else
-#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
+#define GEMV_LOADPAIR_COL_MMA(iter1, iter2)                                     \
   const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \
-  b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src##iter1));
+  b##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src##iter1));
 #endif
 
-#define GEMV_LOAD1A_COL_MMA(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
-    if (GEMV_IS_FLOAT) { \
-      g##iter = GEMV_LOADPACKET_COL(iter); \
-      EIGEN_UNUSED_VARIABLE(b##iter); \
-    } else { \
+#define GEMV_LOAD1A_COL_MMA(iter, N)         \
+  if (GEMV_GETN(N) > iter) {                 \
+    if (GEMV_IS_FLOAT) {                     \
+      g##iter = GEMV_LOADPACKET_COL(iter);   \
+      EIGEN_UNUSED_VARIABLE(b##iter);        \
+    } else {                                 \
       GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \
-      EIGEN_UNUSED_VARIABLE(g##iter); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(b##iter); \
-    EIGEN_UNUSED_VARIABLE(g##iter); \
+      EIGEN_UNUSED_VARIABLE(g##iter);        \
+    }                                        \
+  } else {                                   \
+    EIGEN_UNUSED_VARIABLE(b##iter);          \
+    EIGEN_UNUSED_VARIABLE(g##iter);          \
   }
 
-#define GEMV_WORK1A_COL_MMA(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
-    if (GEMV_IS_FLOAT) { \
+#define GEMV_WORK1A_COL_MMA(iter, N)                                      \
+  if (GEMV_GETN(N) > iter) {                                              \
+    if (GEMV_IS_FLOAT) {                                                  \
       pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, a0, g##iter); \
-    } else { \
+    } else {                                                              \
       pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, b##iter, a0); \
-    } \
+    }                                                                     \
   }
 
 #define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN(N) > iter1) { \
-    if (GEMV_IS_FLOAT) { \
-      GEMV_LOADPAIR_COL_MMA(iter2, iter2) \
-      EIGEN_UNUSED_VARIABLE(b##iter3); \
-    } else { \
-      GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \
-      GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(b##iter2); \
-    EIGEN_UNUSED_VARIABLE(b##iter3); \
-  } \
-  EIGEN_UNUSED_VARIABLE(g##iter2); \
+  if (GEMV_GETN(N) > iter1) {                       \
+    if (GEMV_IS_FLOAT) {                            \
+      GEMV_LOADPAIR_COL_MMA(iter2, iter2)           \
+      EIGEN_UNUSED_VARIABLE(b##iter3);              \
+    } else {                                        \
+      GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1)      \
+      GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1)      \
+    }                                               \
+  } else {                                          \
+    EIGEN_UNUSED_VARIABLE(b##iter2);                \
+    EIGEN_UNUSED_VARIABLE(b##iter3);                \
+  }                                                 \
+  EIGEN_UNUSED_VARIABLE(g##iter2);                  \
   EIGEN_UNUSED_VARIABLE(g##iter3);
 
-#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN(N) > iter1) { \
-    if (GEMV_IS_FLOAT) { \
-      LhsPacket h[2]; \
+#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N)                          \
+  if (GEMV_GETN(N) > iter1) {                                                \
+    if (GEMV_IS_FLOAT) {                                                     \
+      LhsPacket h[2];                                                        \
       __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(h), &b##iter2); \
-      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]); \
-      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]); \
-    } else { \
-      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0); \
-      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0); \
-    } \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]);      \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]);      \
+    } else {                                                                 \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0);  \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0);  \
+    }                                                                        \
   }
 
 #if EIGEN_COMP_LLVM
-#define GEMV_LOAD_COL_MMA(N) \
-  if (GEMV_GETN(N) > 1) { \
+#define GEMV_LOAD_COL_MMA(N)                        \
+  if (GEMV_GETN(N) > 1) {                           \
     GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \
-  } else { \
-    GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \
+  } else {                                          \
+    GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)             \
   }
 
-#define GEMV_WORK_COL_MMA(N) \
-  if (GEMV_GETN(N) > 1) { \
+#define GEMV_WORK_COL_MMA(N)                        \
+  if (GEMV_GETN(N) > 1) {                           \
     GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \
-  } else { \
-    GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \
+  } else {                                          \
+    GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)             \
   }
 #else
-#define GEMV_LOAD_COL_MMA(N) \
-  GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
+#define GEMV_LOAD_COL_MMA(N) GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
 
-#define GEMV_WORK_COL_MMA(N) \
-  GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
+#define GEMV_WORK_COL_MMA(N) GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
 #endif
 
-#define GEMV_DISASSEMBLE_MMA(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
+#define GEMV_DISASSEMBLE_MMA(iter, N)                              \
+  if (GEMV_GETN(N) > iter) {                                       \
     __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \
-    if (!GEMV_IS_FLOAT) { \
-      result##iter.packet[0][1] = result##iter.packet[1][0]; \
-      result##iter.packet[2][1] = result##iter.packet[3][0]; \
-    } \
+    if (!GEMV_IS_FLOAT) {                                          \
+      result##iter.packet[0][1] = result##iter.packet[1][0];       \
+      result##iter.packet[2][1] = result##iter.packet[3][0];       \
+    }                                                              \
   }
 
 #define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
-  b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
+  b##iter1 = *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize));
 
 #define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN(N) > iter1) { \
-    if (GEMV_IS_FLOAT) { \
-      GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \
-      EIGEN_UNUSED_VARIABLE(b##iter3); \
-    } else { \
-      GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \
-      GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(b##iter2); \
-    EIGEN_UNUSED_VARIABLE(b##iter3); \
+  if (GEMV_GETN(N) > iter1) {                      \
+    if (GEMV_IS_FLOAT) {                           \
+      GEMV_LOADPAIR2_COL_MMA(iter2, iter2);        \
+      EIGEN_UNUSED_VARIABLE(b##iter3);             \
+    } else {                                       \
+      GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1);   \
+      GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1);   \
+    }                                              \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(b##iter2);               \
+    EIGEN_UNUSED_VARIABLE(b##iter3);               \
   }
 
 #if EIGEN_COMP_LLVM
-#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
-  ResPacket f##iter2[2]; \
-  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
-  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4)                                         \
+  ResPacket f##iter2[2];                                                                    \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2);             \
+  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]);                        \
   f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
   GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
 #else
-#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
-  if (GEMV_IS_FLOAT) { \
-    __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
-  } else { \
-    __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4)                                        \
+  if (GEMV_IS_FLOAT) {                                                                     \
+    __asm__("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3"                                \
+            : "+&d"(b##iter2)                                                              \
+            : "wa"(result##iter3.packet[0]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
+  } else {                                                                                 \
+    __asm__("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3"                                \
+            : "+&d"(b##iter2)                                                              \
+            : "wa"(result##iter2.packet[2]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
   }
 #endif
 
-#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN(N) > iter1) { \
-    if (GEMV_IS_FLOAT) { \
-      GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \
-    } else { \
+#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N)      \
+  if (GEMV_GETN(N) > iter1) {                           \
+    if (GEMV_IS_FLOAT) {                                \
+      GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2);      \
+    } else {                                            \
       GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \
       GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \
-    } \
+    }                                                   \
   }
 
 #define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \
-  *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
+  *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
 
-#define GEMV_STORE_COL_MMA(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
-    if (GEMV_IS_FLOAT) { \
+#define GEMV_STORE_COL_MMA(iter, N)                                                                          \
+  if (GEMV_GETN(N) > iter) {                                                                                 \
+    if (GEMV_IS_FLOAT) {                                                                                     \
       storeMaddData<ResPacket, ResScalar>(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \
-    } else { \
-      GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \
-      GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \
-      GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \
-    } \
+    } else {                                                                                                 \
+      GEMV_LOADPAIR2_COL_MMA(iter, iter << 1)                                                                \
+      GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1)                                                          \
+      GEMV_STOREPAIR2_COL_MMA(iter, iter << 1)                                                               \
+    }                                                                                                        \
   }
 
 #define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN(N) > iter1) { \
-    if (GEMV_IS_FLOAT) { \
-      GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \
-    } else { \
-      GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \
-      GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \
-    } \
+  if (GEMV_GETN(N) > iter1) {                       \
+    if (GEMV_IS_FLOAT) {                            \
+      GEMV_STOREPAIR2_COL_MMA(iter2, iter2);        \
+    } else {                                        \
+      GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1)    \
+      GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1)    \
+    }                                               \
   }
 
-#define GEMV_PROCESS_COL_ONE_MMA(N) \
-  GEMV_UNROLL(GEMV_INIT_MMA, N) \
-  Index j = j2; \
-  __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \
-  do { \
-    LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \
-    RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
-    GEMV_UNROLL(GEMV_PREFETCH, N) \
-    GEMV_LOAD_COL_MMA(N) \
-    GEMV_WORK_COL_MMA(N) \
-  } while (++j < jend); \
-  GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \
-  if (GEMV_GETN(N) <= 1) { \
-    GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \
-  } else { \
-    GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \
-    GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \
+#define GEMV_PROCESS_COL_ONE_MMA(N)                 \
+  GEMV_UNROLL(GEMV_INIT_MMA, N)                     \
+  Index j = j2;                                     \
+  __vector_pair b0, b1, b2, b3, b4, b5, b6, b7;     \
+  do {                                              \
+    LhsPacket g0, g1, g2, g3, g4, g5, g6, g7;       \
+    RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0));    \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                   \
+    GEMV_LOAD_COL_MMA(N)                            \
+    GEMV_WORK_COL_MMA(N)                            \
+  } while (++j < jend);                             \
+  GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N)              \
+  if (GEMV_GETN(N) <= 1) {                          \
+    GEMV_UNROLL(GEMV_STORE_COL_MMA, N)              \
+  } else {                                          \
+    GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1))  \
+    GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1))  \
     GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \
-  } \
+  }                                                 \
   i += (ResPacketSize * N);
 #endif
 
-#define GEMV_INIT(iter, N) \
-  if (N > iter) { \
+#define GEMV_INIT(iter, N)                    \
+  if (N > iter) {                             \
     c##iter = pset1<ResPacket>(ResScalar(0)); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(c##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(c##iter);           \
   }
 
 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-#define GEMV_PREFETCH(iter, N) \
-  if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \
+#define GEMV_PREFETCH(iter, N)                                   \
+  if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) {  \
     lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \
   }
 #else
 #define GEMV_PREFETCH(iter, N)
 #endif
 
-#define GEMV_WORK_COL(iter, N) \
-  if (N > iter) { \
+#define GEMV_WORK_COL(iter, N)                                   \
+  if (N > iter) {                                                \
     c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \
   }
 
-#define GEMV_STORE_COL(iter, N) \
-  if (N > iter) { \
-    pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
+#define GEMV_STORE_COL(iter, N)                                                           \
+  if (N > iter) {                                                                         \
+    pstoreu(res + i + (iter * ResPacketSize),                                             \
+            pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
   }
 
 /** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */
-#define GEMV_PROCESS_COL_ONE(N) \
-  GEMV_UNROLL(GEMV_INIT, N) \
-  Index j = j2; \
-  do { \
+#define GEMV_PROCESS_COL_ONE(N)                  \
+  GEMV_UNROLL(GEMV_INIT, N)                      \
+  Index j = j2;                                  \
+  do {                                           \
     RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
-    GEMV_UNROLL(GEMV_PREFETCH, N) \
-    GEMV_UNROLL(GEMV_WORK_COL, N) \
-  } while (++j < jend); \
-  GEMV_UNROLL(GEMV_STORE_COL, N) \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                \
+    GEMV_UNROLL(GEMV_WORK_COL, N)                \
+  } while (++j < jend);                          \
+  GEMV_UNROLL(GEMV_STORE_COL, N)                 \
   i += (ResPacketSize * N);
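
Stripped of packets and prefetching, one expansion of this macro performs a straightforward column-block accumulation; a scalar sketch with hypothetical names (one result element instead of N result packets):

    // c = sum over j in [j2, jend) of lhs(i, j) * rhs(j), then
    // res[i] += alpha * c; the packet code does this for ResPacketSize * N
    // result elements at once.
    void processColOneScalar(const float* lhs, long lhsStride, const float* rhs,
                             float* res, long i, long j2, long jend, float alpha) {
      float c = 0.0f;
      for (long j = j2; j < jend; ++j) c += lhs[i + j * lhsStride] * rhs[j];
      res[i] += alpha * c;
    }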
 
 #ifdef USE_GEMV_MMA
-#define GEMV_PROCESS_COL(N) \
-  GEMV_PROCESS_COL_ONE_MMA(N)
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE_MMA(N)
 #else
-#define GEMV_PROCESS_COL(N) \
-  GEMV_PROCESS_COL_ONE(N)
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE(N)
 #endif
 
 /** \internal perform a matrix multiply and accumulate of packet a and packet b */
 #ifdef USE_GEMV_MMA
-template<typename LhsPacket, typename RhsPacket, bool accumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
-{
-    if (accumulate)
-    {
-        __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
-    }
-    else
-    {
-        __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
-    }
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+  if (accumulate) {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
 }
 
 /** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool accumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b)
-{
-    if (accumulate)
-    {
-        __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
-    }
-    else
-    {
-        __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
-    }
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) {
+  if (accumulate) {
+    __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
+  }
 }
 #endif
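
For readers new to the MMA builtins used above: a __vector_quad accumulator is zeroed with xxsetaccz, updated by rank-1 ger operations, and finally disassembled into four vectors, which is exactly the lifecycle GEMV_PROCESS_COL_ONE_MMA drives through these helpers. A minimal sketch, assuming a POWER10 target built with GCC and -mcpu=power10 (rank1_accumulate is hypothetical, not part of Eigen):

    #include <altivec.h>

    // Accumulate one rank-1 update of two 4-float vectors into a 4x4
    // accumulator, then read the result back out.
    void rank1_accumulate(const float* a, const float* b, float out[4][4]) {
      __vector_quad acc;
      __builtin_mma_xxsetaccz(&acc);             // zero the 4x4 accumulator
      __vector unsigned char va = (__vector unsigned char)vec_xl(0, a);
      __vector unsigned char vb = (__vector unsigned char)vec_xl(0, b);
      __builtin_mma_xvf32gerpp(&acc, va, vb);    // acc += outer(a, b)
      __vector float res[4];
      __builtin_mma_disassemble_acc(res, &acc);  // four rows of the result
      for (int i = 0; i < 4; ++i) vec_xst(res[i], 0, out[i]);
    }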
 
-template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_col(
-    Index rows, Index cols,
-    const LhsMapper& alhs,
-    const RhsMapper& rhs,
-    ResScalar* res, Index resIncr,
-    ResScalar alpha)
-{
-    typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+                                  Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
 
-    typedef typename Traits::LhsPacket LhsPacket;
-    typedef typename Traits::RhsPacket RhsPacket;
-    typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-    EIGEN_UNUSED_VARIABLE(resIncr);
-    eigen_internal_assert(resIncr == 1);
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
 
-    // The following copy tells the compiler that lhs's attributes are not modified outside this function
-    // This helps GCC to generate proper code.
-    LhsMapper lhs(alhs);
-    RhsMapper rhs2(rhs);
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
 
-    conj_helper<LhsScalar, RhsScalar, false, false> cj;
-    conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+  conj_helper<LhsScalar, RhsScalar, false, false> cj;
+  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
 
-    const Index lhsStride = lhs.stride();
-    // TODO: for padded aligned inputs, we could enable aligned reads
-    enum {
-        LhsAlignment = Unaligned,
-        ResPacketSize = Traits::ResPacketSize,
-        LhsPacketSize = Traits::LhsPacketSize,
-        RhsPacketSize = Traits::RhsPacketSize,
-    };
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    RhsPacketSize = Traits::RhsPacketSize,
+  };
 
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    const Index n8 = rows - 8 * ResPacketSize + 1;
-    const Index n4 = rows - 4 * ResPacketSize + 1;
-    const Index n2 = rows - 2 * ResPacketSize + 1;
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
 #endif
-    const Index n1 = rows - 1 * ResPacketSize + 1;
+  const Index n1 = rows - 1 * ResPacketSize + 1;
 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-    const Index prefetch_dist = 64 * LhsPacketSize;
+  const Index prefetch_dist = 64 * LhsPacketSize;
 #endif
 
-    // TODO: improve the following heuristic:
-    const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
-    ResPacket palpha = pset1<ResPacket>(alpha);
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+  ResPacket palpha = pset1<ResPacket>(alpha);
 
-    for (Index j2 = 0; j2 < cols; j2 += block_cols)
-    {
-        Index jend = numext::mini(j2 + block_cols, cols);
-        Index i = 0;
-        ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
 #ifdef USE_GEMV_MMA
-        __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
-        PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
-        GEMV_UNUSED(8, e)
-        GEMV_UNUSED(8, result)
-        GEMV_UNUSED_EXTRA(1, c)
+    __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
+    PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
+    GEMV_UNUSED(8, e)
+    GEMV_UNUSED(8, result)
+    GEMV_UNUSED_EXTRA(1, c)
 #endif
 #ifndef GCC_ONE_VECTORPAIR_BUG
-        while (i < n8)
-        {
-            GEMV_PROCESS_COL(8)
-        }
-        if (i < n4)
-        {
-            GEMV_PROCESS_COL(4)
-        }
-        if (i < n2)
-        {
-            GEMV_PROCESS_COL(2)
-        }
-        if (i < n1)
-#else
-        while (i < n1)
-#endif
-        {
-            GEMV_PROCESS_COL_ONE(1)
-        }
-        for (;i < rows;++i)
-        {
-            ResScalar d0(0);
-            Index j = j2;
-            do {
-                d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
-            } while (++j < jend);
-            res[i] += alpha * d0;
-        }
+    while (i < n8) {
+      GEMV_PROCESS_COL(8)
     }
+    if (i < n4) {
+      GEMV_PROCESS_COL(4)
+    }
+    if (i < n2) {
+      GEMV_PROCESS_COL(2)
+    }
+    if (i < n1)
+#else
+    while (i < n1)
+#endif
+    {
+      GEMV_PROCESS_COL_ONE(1)
+    }
+    for (; i < rows; ++i) {
+      ResScalar d0(0);
+      Index j = j2;
+      do {
+        d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+      } while (++j < jend);
+      res[i] += alpha * d0;
+    }
+  }
 }
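
The intent of the block_cols heuristic above appears to be cache blocking: for cols >= 128, columns whose footprint is under roughly 16 KB (lhsStride * sizeof(LhsScalar) < 16000) are processed 16 at a time, larger columns 8 at a time, so a block of lhs columns stays cache-resident while i sweeps the rows. For a 1000x1000 float matrix (about 4 KB per column), for example, this selects 16-column blocks.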
 
-template<bool extraRows>
-EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float *result, Packet4f pAlpha, Index extra_rows)
-{
+template <bool extraRows>
+EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float* result, Packet4f pAlpha, Index extra_rows) {
   Packet4f d0 = ploadu<Packet4f>(result);
   d0 = pmadd(acc, pAlpha, d0);
   if (extraRows) {
@@ -477,28 +452,27 @@
   }
 }
 
-template<Index num_acc, bool extraRows, Index size>
-EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
-{
+template <Index num_acc, bool extraRows, Index size>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+                                             Index extra_rows) {
   constexpr Index real_acc = (num_acc - (extraRows ? 1 : 0));
-  for(Index k = 0; k < real_acc; k++) {
-    outputVecCol<false>(acc[k][0], result + k*4, pAlpha, extra_rows);
+  for (Index k = 0; k < real_acc; k++) {
+    outputVecCol<false>(acc[k][0], result + k * 4, pAlpha, extra_rows);
   }
   if (extraRows) {
-    outputVecCol<true>(acc[real_acc][0], result + real_acc*4, pAlpha, extra_rows);
+    outputVecCol<true>(acc[real_acc][0], result + real_acc * 4, pAlpha, extra_rows);
   }
 }
 
-static Packet16uc p16uc_MERGE16_32_V1 = {  0, 1, 16,17,  0, 1, 16,17,  0, 1, 16,17,  0, 1, 16,17 };
-static Packet16uc p16uc_MERGE16_32_V2 = {  2, 3, 18,19,  2, 3, 18,19,  2, 3, 18,19,  2, 3, 18,19 };
+static Packet16uc p16uc_MERGE16_32_V1 = {0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17};
+static Packet16uc p16uc_MERGE16_32_V2 = {2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19};
 
-template<Index num_acc, typename LhsMapper, bool zero>
-EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2])
-{
-  Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k*4, 0);
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2]) {
+  Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k * 4, 0);
   Packet8bf b1;
   if (!zero) {
-    b1 = lhs.template loadPacket<Packet8bf>(k*4, 1);
+    b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
 
     a0[k + 0][1] = oneConvertBF16Hi(b1.m_val);
   }
@@ -512,22 +486,19 @@
   }
 }
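
loadVecLoopVSX widens each Packet8bf into two Packet4f via oneConvertBF16Hi/Lo. The widening is cheap because a bfloat16 is by definition the top 16 bits of an IEEE-754 binary32; a scalar sketch of the per-lane conversion:

    #include <cstdint>
    #include <cstring>

    // bfloat16 -> float: shift the 16 payload bits into the high half of a
    // 32-bit word and reinterpret it as a float.
    float bf16_to_f32(uint16_t b) {
      uint32_t u = static_cast<uint32_t>(b) << 16;
      float f;
      std::memcpy(&f, &u, sizeof f);
      return f;
    }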
 
-template<Index num_acc, bool zero>
-EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2])
-{
-  for(Index k = 0; k < num_acc; k++) {
-    for(Index i = 0; i < (zero ? 1 : 2); i++) {
+template <Index num_acc, bool zero>
+EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2]) {
+  for (Index k = 0; k < num_acc; k++) {
+    for (Index i = 0; i < (zero ? 1 : 2); i++) {
       acc[k][i] = pmadd(b0[i], a0[k][i], acc[k][i]);
     }
   }
 }
 
-template<typename RhsMapper, bool linear>
-struct loadColData_impl
-{
+template <typename RhsMapper, bool linear>
+struct loadColData_impl {
   // linear == false
-  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
-  {
+  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
     const Index n = unpacket_traits<Packet8bf>::size;
     EIGEN_ALIGN16 bfloat16 to[n];
     LOAD_STORE_UNROLL_16
@@ -538,25 +509,21 @@
   }
 };
 
-template<typename RhsMapper>
-struct loadColData_impl<RhsMapper, true>
-{
+template <typename RhsMapper>
+struct loadColData_impl<RhsMapper, true> {
   // linear == true
-  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
-  {
+  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
     return rhs.template loadPacket<Packet8bf>(j + 0, 0);
   }
 };
 
-template<typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j)
-{
+template <typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j) {
   return loadColData_impl<RhsMapper, linear>::run(rhs, j);
 }
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
-EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2])
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2]) {
   Packet4f a0[num_acc][2], b0[2];
   Packet8bf b2 = loadColData<RhsMapper, linear>(rhs, j);
 
@@ -568,32 +535,31 @@
   using LhsSubMapper = typename LhsMapper::SubMapper;
 
   LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
-  for(Index k = 0; k < num_acc; k += 2) {
+  for (Index k = 0; k < num_acc; k += 2) {
     loadVecLoopVSX<num_acc, LhsSubMapper, zero>(k, lhs2, a0);
   }
 
   multVecVSX<num_acc, zero>(acc, a0, b0);
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2])
-{
-  for(Index i = 0; i < num_acc; i++) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2]) {
+  for (Index i = 0; i < num_acc; i++) {
     acc[i][0] = acc[i][0] + acc[i][1];
   }
 }
 
 // Uses 2X the accumulators or 4X the number of VSX registers
-#define MAX_BFLOAT16_VEC_ACC_VSX   8
+#define MAX_BFLOAT16_VEC_ACC_VSX 8
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                          float* result) {
   constexpr Index step = (num_acc * 4);
   const Index extra_rows = (extraRows) ? (rows & 3) : 0;
   constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
 
-  do{
+  do {
     Packet4f acc[num_acc][2];
 
     zeroAccumulators<num_acc, 2>(acc);
@@ -601,7 +567,7 @@
     using LhsSubMapper = typename LhsMapper::SubMapper;
 
     LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
-    for(Index j = 0; j + 2 <= cend; j += 2) {
+    for (Index j = 0; j + 2 <= cend; j += 2) {
       vecColLoopVSX<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, acc);
     }
     if (cend & 1) {
@@ -613,56 +579,58 @@
     outputVecColResults<num_acc, extraRows, 2>(acc, result, pAlpha, extra_rows);
 
     result += step;
-  } while(multiIters && (step <= rows - (row += step)));
+  } while (multiIters && (step <= rows - (row += step)));
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                    const Packet4f pAlpha, float* result) {
   if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
-    colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+    colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs,
+                                                                                                 rhs, pAlpha, result);
   }
 }
 
-template<typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                   const Packet4f pAlpha, float* result) {
   switch ((rows - row) >> 2) {
-  case 7:
-    colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 6:
-    colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 5:
-    colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 4:
-    colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 3:
-    colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 2:
-    colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 1:
-    colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    break;
-  default:
-    if (extraRows) {
-      colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
-    }
-    break;
+    case 7:
+      colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    default:
+      if (extraRows) {
+        colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      }
+      break;
   }
 }
 
-template<typename LhsMapper, typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                            const Packet4f pAlpha, float* result) {
   Index row = 0;
   if (rows >= (MAX_BFLOAT16_VEC_ACC_VSX * 4)) {
-    colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+    colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs,
+                                                                                        pAlpha, result);
     result += row;
   }
   if (rows & 3) {
@@ -672,14 +640,13 @@
   }
 }
 
-template<const Index size, bool inc, Index delta>
-EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra)
-{
+template <const Index size, bool inc, Index delta>
+EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra) {
   if (inc) {
     if (size < 8) {
-      pscatter_partial(dst + delta*resInc, data, resInc, extra);
+      pscatter_partial(dst + delta * resInc, data, resInc, extra);
     } else {
-      pscatter(dst + delta*resInc, data, resInc);
+      pscatter(dst + delta * resInc, data, resInc);
     }
   } else {
     if (size < 8) {
@@ -690,15 +657,15 @@
   }
 }
 
-template<const Index size, bool inc = false>
-EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc = 1)
-{
+template <const Index size, bool inc = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst,
+                                                    Index resInc = 1) {
   constexpr Index extra = ((size < 8) ? 8 : size);
   while (i + size <= rows) {
-    PacketBlock<Packet8bf,(size+7)/8> r32;
-    r32.packet[0] = convertF32toBF16VSX(result + i +  0);
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = convertF32toBF16VSX(result + i + 0);
     if (size >= 16) {
-      r32.packet[1] = convertF32toBF16VSX(result + i +  8);
+      r32.packet[1] = convertF32toBF16VSX(result + i + 8);
     }
     if (size >= 32) {
       r32.packet[2] = convertF32toBF16VSX(result + i + 16);
@@ -712,25 +679,25 @@
       storeBF16fromResult<size, inc, 16>(dst, r32.packet[2], resInc);
       storeBF16fromResult<size, inc, 24>(dst, r32.packet[3], resInc);
     }
-    i += extra; dst += extra*resInc;
+    i += extra;
+    dst += extra * resInc;
     if (size != 32) break;
   }
 }
 
-template<bool inc = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float *result, Index rows, bfloat16* dst, Index resInc = 1)
-{
+template <bool inc = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
   Index i = 0;
-  convertPointerF32toBF16VSX<32,inc>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16VSX<16,inc>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16VSX<8,inc>(i, result, rows, dst, resInc);
-  convertPointerF32toBF16VSX<1,inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<32, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<16, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<8, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<1, inc>(i, result, rows, dst, resInc);
 }
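
This 32/16/8/1 cascade converts the float result buffer back to bfloat16 in the largest blocks available: only the 32-wide instantiation loops (the smaller ones hit `if (size != 32) break;` after a single pass), and the size-1 instantiation finishes the remainder through the partial-store path. For example, rows = 43 is handled as one 32-element block, then one 8-element block, then a 3-element tail.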
 
-template<typename RhsMapper, typename LhsMapper, typename = void>
+template <typename RhsMapper, typename LhsMapper, typename = void>
 struct UseStride : std::false_type {
-  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
-  {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
     using RhsSubMapper = typename RhsMapper::SubMapper;
 
     RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -738,11 +705,12 @@
   }
 };
 
-template<typename RhsMapper, typename LhsMapper>
-struct UseStride<RhsMapper, LhsMapper, std::enable_if_t<std::is_member_function_pointer<
-                           decltype(&RhsMapper::stride)>::value>> : std::true_type {
-  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
-  {
+template <typename RhsMapper, typename LhsMapper>
+struct UseStride<RhsMapper, LhsMapper,
+                 std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+    : std::true_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
     using RhsSubMapper = typename RhsMapper::SubMapper;
 
     RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -754,14 +722,9 @@
   }
 };
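
UseStride is the standard detection idiom: the partial specialization participates in overload resolution only when &RhsMapper::stride names a member function, selecting between the two run() implementations according to the mapper's interface. A minimal self-contained sketch with hypothetical mapper types:

    #include <type_traits>

    template <typename T, typename = void>
    struct HasStride : std::false_type {};

    // Viable only if T::stride exists as a member function.
    template <typename T>
    struct HasStride<T, std::enable_if_t<std::is_member_function_pointer<
                            decltype(&T::stride)>::value>> : std::true_type {};

    struct StridedMapper { long stride() const { return 7; } };
    struct PlainMapper {};

    static_assert(HasStride<StridedMapper>::value, "specialization chosen");
    static_assert(!HasStride<PlainMapper>::value, "primary template chosen");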
 
-template<typename LhsMapper, typename RhsMapper>
-void gemv_bfloat16_col(
-  Index rows, Index cols,
-  const LhsMapper& alhs,
-  const RhsMapper& rhs,
-  bfloat16* res, Index resIncr,
-  bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+void gemv_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+                       Index resIncr, bfloat16 alpha) {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr == 1);
 
@@ -781,8 +744,7 @@
 
   convertArrayPointerBF16toF32(result, 1, rows, res);
 
-  for (Index j2 = 0; j2 < cols; j2 += block_cols)
-  {
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
     Index jend = numext::mini(j2 + block_cols, cols);
 
     using LhsSubMapper = typename LhsMapper::SubMapper;
@@ -794,12 +756,11 @@
   convertArrayPointerF32toBF16VSX(result, rows, res);
 }
 
-template<Index num_acc, Index size>
-EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha)
-{
+template <Index num_acc, Index size>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha) {
   constexpr Index extra = num_acc & 3;
 
-  for(Index k = 0; k < num_acc; k += 4) {
+  for (Index k = 0; k < num_acc; k += 4) {
     Packet4f d0 = ploadu<Packet4f>(result + k);
     d0 = pmadd(acc[k + 0][0], pAlpha, d0);
 
@@ -809,15 +770,14 @@
       if (extra == 3) {
         pstoreu_partial(result + k, d0, extra);
       } else {
-        memcpy((void *)(result + k), (void *)(&d0), sizeof(float) * extra);
+        memcpy((void*)(result + k), (void*)(&d0), sizeof(float) * extra);
       }
     }
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k) {
   if (num_acc > (k + 1)) {
     acc[k][1] = vec_mergel(acc[k + 0][0], acc[k + 1][0]);
     acc[k][0] = vec_mergeh(acc[k + 0][0], acc[k + 1][0]);
@@ -833,25 +793,24 @@
   }
 }
 
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2])
-{
-  for(Index k = 0; k < num_acc; k += 4) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2]) {
+  for (Index k = 0; k < num_acc; k += 4) {
     preduxVecResults2VSX<num_acc>(acc, k + 0);
     if (num_acc > (k + 2)) {
       preduxVecResults2VSX<num_acc>(acc, k + 2);
 #ifdef EIGEN_VECTORIZE_VSX
-      acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
 #else
-      acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0],acc[k + 2][0],p16uc_TRANSPOSE64_HI));
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0], acc[k + 2][0], p16uc_TRANSPOSE64_HI));
 #endif
     }
   }
 }
 
 #ifndef _ARCH_PWR9
-EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols)
-{
+EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols) {
   Packet16uc shift = pset1<Packet16uc>(8 * 2 * (8 - extra_cols));
 #ifdef _BIG_ENDIAN
   return reinterpret_cast<Packet8us>(vec_slo(vec_sro(reinterpret_cast<Packet16uc>(data), shift), shift));
@@ -861,9 +820,9 @@
 }
 #endif
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
-EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j, Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+                                        Index extra_cols) {
   Packet4f a0[num_acc][2], b0[2];
   Packet8bf a1, b1;
 
@@ -879,7 +838,7 @@
   b0[1] = oneConvertBF16Lo(b1.m_val);
 
   const LhsMapper lhs2 = lhs.getSubMapper(0, j);
-  for(Index k = 0; k < num_acc; k++) {
+  for (Index k = 0; k < num_acc; k++) {
     if (extra) {
       a1 = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
 #ifndef _ARCH_PWR9
@@ -895,11 +854,11 @@
   multVecVSX<num_acc, false>(acc, a0, b0);
 }
 
-template<Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2], Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2],
+                                    Index extra_cols) {
   Index j = 0;
-  for(; j + 8 <= cols; j += 8){
+  for (; j + 8 <= cols; j += 8) {
     multVSXVecLoop<num_acc, LhsMapper, RhsMapper, false>(acc, lhs, rhs, j, extra_cols);
   }
 
@@ -908,13 +867,13 @@
   }
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                       float* result) {
   constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
   const Index extra_cols = (cols & 7);
 
-  do{
+  do {
     Packet4f acc[num_acc][2];
 
     zeroAccumulators<num_acc, 2>(acc);
@@ -929,48 +888,48 @@
     outputVecResults<num_acc, 2>(acc, result, pAlpha);
 
     result += num_acc;
-  } while(multiIters && (num_acc <= rows - (row += num_acc)));
+  } while (multiIters && (num_acc <= rows - (row += num_acc)));
 }
 
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                 const Packet4f pAlpha, float* result) {
   if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
     colVSXVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
   }
 }
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                const Packet4f pAlpha, float* result) {
   switch (rows - row) {
-  case 7:
-    colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 6:
-    colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 5:
-    colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 4:
-    colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 3:
-    colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 2:
-    colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
-  case 1:
-    colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
-    break;
+    case 7:
+      colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
   }
 }
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                         float* result) {
   Index row = 0;
   if (rows >= MAX_BFLOAT16_VEC_ACC_VSX) {
     colVSXVecLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
@@ -979,14 +938,9 @@
   colVSXVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
 }
 
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_STRONG_INLINE void gemv_bfloat16_row(
-  Index rows, Index cols,
-  const LhsMapper& alhs,
-  const RhsMapper& rhs,
-  bfloat16* res, Index resIncr,
-  bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemv_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                           bfloat16* res, Index resIncr, bfloat16 alpha) {
   typedef typename RhsMapper::LinearMapper LinearMapper;
 
   // The following copy tells the compiler that lhs's attributes are not modified outside this function
@@ -1015,51 +969,65 @@
 
 #undef MAX_BFLOAT16_VEC_ACC_VSX
 
-const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb };
-const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 };
+const Packet16uc p16uc_COMPLEX32_XORFLIP = {0x44, 0x55, 0x66, 0x77, 0x00, 0x11, 0x22, 0x33,
+                                            0xcc, 0xdd, 0xee, 0xff, 0x88, 0x99, 0xaa, 0xbb};
+const Packet16uc p16uc_COMPLEX64_XORFLIP = {0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+                                            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
 
 #ifdef _BIG_ENDIAN
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR  = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR  = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_NEGATE    = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_NEGATE    = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                             0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                             0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                              0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                           0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                           0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
 #else
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR  = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR  = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_NEGATE    = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX64_NEGATE    = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                              0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
+                                           0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
 #endif
 
 #ifdef _BIG_ENDIAN
-#define COMPLEX_DELTA  0
+#define COMPLEX_DELTA 0
 #else
-#define COMPLEX_DELTA  2
+#define COMPLEX_DELTA 2
 #endif
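+// COMPLEX_DELTA is the float-lane offset at which a doubleword load places a
+// single std::complex<float> inside a Packet4f: lanes 0-1 on big endian,
+// lanes 2-3 on little endian (see pload_complex_half below).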
 
 /** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */
 EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) {
-    return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
 }
 
 EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) {
-    return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
 }
 
 /** \internal packet conjugate with real & imaginary operation inverted */
 EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) {
 #ifdef __POWER8_VECTOR__
-    return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
+  return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
 #else
-    return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
 #endif
 }
 
 EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) {
-    return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
 }
 
 #if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12)
@@ -1067,883 +1035,773 @@
 #endif
 
 /** \internal flip the real & imaginary results and packet conjugate */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) {
 #ifdef PERMXOR_GOOD
-    return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
 #else
-    return pcplxflip(pconj2(a));
+  return pcplxflip(pconj2(a));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) {
 #ifdef PERMXOR_GOOD
-    return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
 #else
-    return pcplxflip(pconj2(a));
+  return pcplxflip(pconj2(a));
 #endif
 }
 
 /** \internal packet conjugate and flip the real & imaginary results */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) {
 #ifdef PERMXOR_GOOD
-    return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
 #else
-    return pconj2(pcplxflip(a));
+  return pconj2(pcplxflip(a));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) {
 #ifdef PERMXOR_GOOD
-    return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
 #else
-    return pconj2(pcplxflip(a));
+  return pconj2(pcplxflip(a));
 #endif
 }
 
 /** \internal packet negate */
-EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) {
 #ifdef __POWER8_VECTOR__
-    return Packet2cf(vec_neg(a.v));
+  return Packet2cf(vec_neg(a.v));
 #else
-    return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) {
 #ifdef __POWER8_VECTOR__
-    return Packet1cd(vec_neg(a.v));
+  return Packet1cd(vec_neg(a.v));
 #else
-    return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
 #endif
 }
 
 /** \internal flip the real & imaginary results and negate */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) {
 #ifdef PERMXOR_GOOD
-    return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
 #else
-    return pcplxflip(pnegate2(a));
+  return pcplxflip(pnegate2(a));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) {
 #ifdef PERMXOR_GOOD
-    return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
 #else
-    return pcplxflip(pnegate2(a));
+  return pcplxflip(pnegate2(a));
 #endif
 }
 
 /** \internal flip the real & imaginary results */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a)
-{
-    return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) {
+  return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) {
 #ifdef EIGEN_VECTORIZE_VSX
-    return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
+  return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
 #else
-    return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
+  return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
 #endif
 }
 
 /** \internal load half a vector with one complex value */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
-{
-    Packet4f t;
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src) {
+  Packet4f t;
 #ifdef EIGEN_VECTORIZE_VSX
-    // Load float64/two float32 (doubleword alignment)
-    __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src));
+  // Load float64/two float32 (doubleword alignment)
+  __asm__("lxsdx %x0,%y1" : "=wa"(t) : "Z"(*src));
 #else
-    *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
+  *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
 #endif
-    return t;
+  return t;
 }
 
 /** \internal load two vectors from the real and imaginary portions of a complex value */
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i)
-{
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) {
 #ifdef _ARCH_PWR9
-    __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<float*>(src) + 0)));
-    __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<float*>(src) + 1)));
+  __asm__("lxvwsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<float*>(src) + 0)));
+  __asm__("lxvwsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<float*>(src) + 1)));
 #else
-    Packet4f t = pload_complex_half(src);
-    r = vec_splat(t, COMPLEX_DELTA + 0);
-    i = vec_splat(t, COMPLEX_DELTA + 1);
+  Packet4f t = pload_complex_half(src);
+  r = vec_splat(t, COMPLEX_DELTA + 0);
+  i = vec_splat(t, COMPLEX_DELTA + 1);
 #endif
 }
 
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
-{
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) {
 #ifdef EIGEN_VECTORIZE_VSX
-    __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
-    __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<double*>(src) + 1)));
+  __asm__("lxvdsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+  __asm__("lxvdsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<double*>(src) + 1)));
 #else
-    Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
-    r = vec_splat(t, 0);
-    i = vec_splat(t, 1);
+  Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
+  r = vec_splat(t, 0);
+  i = vec_splat(t, 1);
 #endif
 }
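+// lxvwsx/lxvdsx load a single word/doubleword and splat it across the vector,
+// so r and i each hold the real or imaginary part of *src in every lane.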
 
 #ifndef __POWER8_VECTOR__
-const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B };
+const Packet16uc p16uc_MERGEE = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B};
 
-const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
+const Packet16uc p16uc_MERGEO = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F};
 #endif
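+// The p16uc_MERGEE/p16uc_MERGEO masks above emulate vec_mergee/vec_mergeo
+// (interleave the even or odd 32-bit lanes of two vectors) on targets without
+// the POWER8 builtins.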
 
 /** \internal load two vectors from the interleaved real & imaginary values of src */
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i)
-{
-    Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) {
+  Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
 #ifdef __POWER8_VECTOR__
-    r = vec_mergee(t, t);
-    i = vec_mergeo(t, t);
+  r = vec_mergee(t, t);
+  i = vec_mergeo(t, t);
 #else
-    r = vec_perm(t, t, p16uc_MERGEE);
-    i = vec_perm(t, t, p16uc_MERGEO);
+  r = vec_perm(t, t, p16uc_MERGEE);
+  i = vec_perm(t, t, p16uc_MERGEO);
 #endif
 }
 
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i)
-{
-    return pload_realimag(src, r, i);
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) {
+  return pload_realimag(src, r, i);
 }
 
 /** \internal load and splat a complex value into a vector - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
-{
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src) {
 #ifdef EIGEN_VECTORIZE_VSX
-    Packet4f ret;
-    __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
-    return ret;
+  Packet4f ret;
+  __asm__("lxvdsx %x0,%y1" : "=wa"(ret) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+  return ret;
 #else
-    return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
+  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
 #endif
 }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src)
-{
-    return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
 
 /** \internal load a complex value into a vector - row-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src)
-{
-    return ploadu<Packet2cf>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src)
-{
-    return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
 
 /** \internal load a scalar or a vector from complex location */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src)
-{
-    if (GEMV_IS_SCALAR) {
-        return pload_complex_half(src);
-    }
-    else
-    {
-        return ploadu<Packet4f>(reinterpret_cast<float*>(src));
-    }
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src) {
+  if (GEMV_IS_SCALAR) {
+    return pload_complex_half(src);
+  } else {
+    return ploadu<Packet4f>(reinterpret_cast<float*>(src));
+  }
 }
 
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src)
-{
-    return ploadu<Packet2d>(reinterpret_cast<double*>(src));
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src) {
+  return ploadu<Packet2d>(reinterpret_cast<double*>(src));
 }
 
 /** \internal load from a complex vector and convert to a real vector */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src)
-{
-    return src->v;
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) {
+  return src->v;
 }
 
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src)
-{
-    return src->v;
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) {
+  return src->v;
 }
 
 /** \internal load a full vector from complex location - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src)
-{
-    return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src) {
+  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
 }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src)
-{
-    return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
 
 /** \internal load a full vector from complex location - row-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src)
-{
-    return ploadu<Packet2cf>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src)
-{
-    return pload_complex_full(src);
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src) { return pload_complex_full(src); }
 
 /** \internal load a vector from a real-only scalar location - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src)
-{
-    return pset1<Packet4f>(*src);
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) { return pset1<Packet4f>(*src); }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src)
-{
-    return pset1<Packet2d>(*src);
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) { return pset1<Packet2d>(*src); }
 
-EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src)
-{
-    return src;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) { return src; }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src)
-{
-    return src;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) { return src; }
 
 /** \internal load a vector from a real-only vector location */
-EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src)
-{
-    Packet4f ret = ploadu<Packet4f>(src);
-    return vec_mergeh(ret, ret);
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) {
+  Packet4f ret = ploadu<Packet4f>(src);
+  return vec_mergeh(ret, ret);
 }
 
-EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src)
-{
-    return pload_real(src);
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) { return pload_real(src); }
+
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src) {
+  return pload_complex_full(src);  // Just for compilation
 }
 
-EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src)
-{
-    return pload_complex_full(src);   // Just for compilation
-}
-
-EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src)
-{
-    return pload_complex_full(src);   // Just for compilation
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src) {
+  return pload_complex_full(src);  // Just for compilation
 }
 
 /** \internal load a vector from a real-only scalar location - row-wise */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src)
-{
-    if (GEMV_IS_SCALAR) {
-        return pload_real_full(src);
-    }
-    else {
-        return ploadu<Packet4f>(src);
-    }
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) {
+  if (GEMV_IS_SCALAR) {
+    return pload_real_full(src);
+  } else {
+    return ploadu<Packet4f>(src);
+  }
 }
 
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src)
-{
-    return pload_real(src);
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) {
+  return pload_real(src);
 }
 
-EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b)
-{
-    EIGEN_UNUSED_VARIABLE(b);
-    return a;  // Just for compilation
+EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b) {
+  EIGEN_UNUSED_VARIABLE(b);
+  return a;  // Just for compilation
 }
 
-EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b)
-{
-    EIGEN_UNUSED_VARIABLE(b);
-    return a;  // Just for compilation
+EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b) {
+  EIGEN_UNUSED_VARIABLE(b);
+  return a;  // Just for compilation
 }
 
 /** \internal set a scalar from complex location */
-template<typename Scalar, typename ResScalar>
-EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj)
-{
-    return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
+template <typename Scalar, typename ResScalar>
+EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) {
+  return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
 }
 
 /** \internal set a vector from complex location */
-template<typename Scalar, typename ResScalar, typename ResPacket, int which>
-EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha)
-{
-    Packet2cf ret;
-    ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
-    ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
-    ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
-    ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
-    return ret;
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha) {
+  Packet2cf ret;
+  ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+  ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+  ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
+  ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
+  return ret;
 }
 
-template<typename Scalar, typename ResScalar, typename ResPacket, int which>
-EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha)
-{
-    Packet1cd ret;
-    ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
-    ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
-    return ret;
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha) {
+  Packet1cd ret;
+  ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+  ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+  return ret;
 }
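+// In pset1_realimag, a set bit in 'which' (0x1/0x2 for the first/second
+// element) selects alpha.real() and a clear bit selects alpha.imag(), while
+// bits 0x4/0x8 negate; e.g. 0x3 splats the real part, 0x0 the imaginary part.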
 
 /** \internal zero out a vector for real or complex forms */
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet pset_zero()
-{
-    return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pset_zero() {
+  return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
 }
 
-template<>
-EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>()
-{
-    return Packet2cf(pset1<Packet4f>(float(0)));
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>() {
+  return Packet2cf(pset1<Packet4f>(float(0)));
 }
 
-template<>
-EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>()
-{
-    return Packet1cd(pset1<Packet2d>(double(0)));
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>() {
+  return Packet1cd(pset1<Packet2d>(double(0)));
 }
 
 /** \internal initialize a vector from another vector */
-template<typename Packet, typename LhsPacket, typename RhsPacket>
-EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1)
-{
-    if (GEMV_IS_COMPLEX_COMPLEX) {
-        EIGEN_UNUSED_VARIABLE(c1);
-        return pset_zero<Packet>();
-    }
-    else
-    {
-        return c1;  // Intentionally left uninitialized
-    }
+template <typename Packet, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) {
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    EIGEN_UNUSED_VARIABLE(c1);
+    return pset_zero<Packet>();
+  } else {
+    return c1;  // Intentionally left uninitialized
+  }
 }
 
-template<typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
-struct alpha_store
-{
-    alpha_store(ResScalar& alpha) {
-        separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
-        separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
-    }
-    struct ri {
-        PResPacket r;
-        PResPacket i;
-    } separate;
+template <typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
+struct alpha_store {
+  alpha_store(ResScalar& alpha) {
+    separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
+    separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
+  }
+  struct ri {
+    PResPacket r;
+    PResPacket i;
+  } separate;
 };
 
 /** \internal multiply and add for complex math */
-template<typename ScalarPacket, typename AlphaData>
-EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0)
-{
-    return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
+template <typename ScalarPacket, typename AlphaData>
+EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) {
+  return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
 }
 
 /** \internal store and madd for complex math */
-template<typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData>
-EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res)
-{
-    PResPacket c2 = pcplxflipconj(c0);
-    if (GEMV_IS_SCALAR) {
-        ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
-        ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
-        pstoreu(reinterpret_cast<Scalar*>(res), c3);
-    } else {
-        ScalarPacket c4 = pload_complex<ResPacket>(res);
-        PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
-        pstoreu(res, c3);
-    }
+template <typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar,
+          typename AlphaData>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) {
+  PResPacket c2 = pcplxflipconj(c0);
+  if (GEMV_IS_SCALAR) {
+    ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
+    ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
+    pstoreu(reinterpret_cast<Scalar*>(res), c3);
+  } else {
+    ScalarPacket c4 = pload_complex<ResPacket>(res);
+    PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+    pstoreu(res, c3);
+  }
 }
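+// With c0 = (xr, xi) per complex lane and c2 = pcplxflipconj(c0) = (-xi, xr),
+// pmadd_complex accumulates (xr*Re(alpha) - xi*Im(alpha),
+// xi*Re(alpha) + xr*Im(alpha)) into res, i.e. res += alpha * c0.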
 
-template<typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData, Index ResPacketSize, Index iter2>
-EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res)
-{
-    PResPacket c2 = pcplxflipconj(c0);
-    PResPacket c3 = pcplxflipconj(c1);
+template <typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData,
+          Index ResPacketSize, Index iter2>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) {
+  PResPacket c2 = pcplxflipconj(c0);
+  PResPacket c3 = pcplxflipconj(c1);
 #if !defined(_ARCH_PWR10)
-    ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
-    ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
-    PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
-    PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
-    pstoreu(res + (iter2 * ResPacketSize), c6);
-    pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
+  ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
+  ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
+  PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+  PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
+  pstoreu(res + (iter2 * ResPacketSize), c6);
+  pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
 #else
-    __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize));
+  __vector_pair a = *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize));
 #if EIGEN_COMP_LLVM
-    PResPacket c6[2];
-    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
-    c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
-    c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
-    GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
+  PResPacket c6[2];
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
+  c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
+  c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
+  GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
 #else
-    if (GEMV_IS_COMPLEX_FLOAT) {
-        __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
-        __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
-    } else {
-        __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
-        __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
-    }
+  if (GEMV_IS_COMPLEX_FLOAT) {
+    __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+    __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+  } else {
+    __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+    __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+  }
 #endif
-    *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a;
+  *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize)) = a;
 #endif
 }
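+// On Power10 the two result vectors stay in a single __vector_pair: each asm
+// statement issues one xvmadda[sd]p per half (%0 and %L0) for the real and
+// imaginary components of alpha before the pair is stored back.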
 
 /** \internal load lhs packet */
-template<typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
-EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j)
-{
-    if (sizeof(Scalar) == sizeof(LhsScalar)) {
-        const LhsScalar& src = lhs(i + 0, j);
-        return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
-    }
-    return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
+template <typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
+EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) {
+  if (sizeof(Scalar) == sizeof(LhsScalar)) {
+    const LhsScalar& src = lhs(i + 0, j);
+    return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
+  }
+  return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
 }
 
 /** \internal madd for complex times complex */
-template<typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c)
-{
-    if (ConjugateLhs && ConjugateRhs) {
-        return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
-    }
-    else if (Negate && !ConjugateLhs && ConjugateRhs) {
-        return vec_nmsub(a, b, c);
-    }
-    else {
-        return vec_madd(a, b, c);
-    }
+template <typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return vec_nmsub(a, b, c);
+  } else {
+    return vec_madd(a, b, c);
+  }
 }
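+// vec_nmsub(a, b, c) computes c - a*b, so the Negate path subtracts the
+// product instead of flipping the sign of b; it is selected for the imaginary
+// half of the accumulation when only the rhs is conjugated.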
 
 /** \internal madd for complex times real */
-template<typename ComplexPacket, typename RealPacket, bool Conjugate>
-EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c)
-{
-    if (Conjugate) {
-        return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
-    }
-    else {
-        return vec_madd(a, b, c);
-    }
+template <typename ComplexPacket, typename RealPacket, bool Conjugate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) {
+  if (Conjugate) {
+    return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+  } else {
+    return vec_madd(a, b, c);
+  }
 }
 
-template<typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
-    conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
-    RhsPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pset1<RhsPacket>(*b);
-    }
-    else {
-        b0 = ploadu<RhsPacket>(b);
-    }
-    c0 = pcj.pmadd(a0, b0, c0);
+template <typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs,
+          bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  RhsPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pset1<RhsPacket>(*b);
+  } else {
+    b0 = ploadu<RhsPacket>(b);
+  }
+  c0 = pcj.pmadd(a0, b0, c0);
 }
 
 /** \internal core multiply operation for vectors - complex times complex */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1)
-{
-    ScalarPacket br, bi;
-    if (StorageOrder == ColMajor) {
-        pload_realimag<RhsScalar>(b, br, bi);
-    }
-    else {
-        pload_realimag_row<RhsScalar>(b, br, bi);
-    }
-    if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
-    LhsPacket a1 = pcplxflipconj(a0);
-    ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
-    ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
-    c1 = ResPacket(ci);
-    c0 = PResPacket(cr);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) {
+  ScalarPacket br, bi;
+  if (StorageOrder == ColMajor) {
+    pload_realimag<RhsScalar>(b, br, bi);
+  } else {
+    pload_realimag_row<RhsScalar>(b, br, bi);
+  }
+  if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
+  LhsPacket a1 = pcplxflipconj(a0);
+  ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
+  ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
+  c1 = ResPacket(ci);
+  c0 = PResPacket(cr);
 }
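+// In the unconjugated case a0 = (ar, ai) and a1 = (-ai, ar), so the partial
+// accumulators hold c0 += (ar*br, ai*br) and c1 += (-ai*bi, ar*bi); once they
+// are combined they yield the complex product (ar*br - ai*bi, ai*br + ar*bi).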
 
 /** \internal core multiply operation for vectors - real times complex */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
-    ScalarPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pload_complex_full(b);
-    }
-    else {
-        b0 = pload_complex_full_row(b);
-    }
-    ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
-    c0 = PResPacket(cri);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_complex_full(b);
+  } else {
+    b0 = pload_complex_full_row(b);
+  }
+  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
+  c0 = PResPacket(cri);
 }
 
 /** \internal core multiply operation for vectors - complex times real */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
-    ScalarPacket a1 = pload_complex<ResPacket>(&a0);
-    ScalarPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pload_real(b);
-    }
-    else {
-        b0 = pload_real_row<ResPacket>(b);
-    }
-    ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
-    c0 = PResPacket(cri);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  ScalarPacket a1 = pload_complex<ResPacket>(&a0);
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_real(b);
+  } else {
+    b0 = pload_real_row<ResPacket>(b);
+  }
+  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
+  c0 = PResPacket(cri);
 }
 
-#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \
-{ \
-    gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0, c1); \
-}
+#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType)                                                        \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) {                   \
+    gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,   \
+                              ConjugateRhs, StorageOrder>(a0, b, c0, c1);                                           \
+  }
 
-GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>,  Packet2cf)
+GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
 GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex<double>, Packet1cd)
 
-#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \
-{ \
-    gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType)                                                           \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) {                      \
+    gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,      \
+                           ConjugateRhs, StorageOrder>(a0, b, c0);                                                  \
+  }
 
-GEMV_MULT_REAL_COMPLEX(float,    std::complex<float>,  Packet2cf)
-GEMV_MULT_REAL_COMPLEX(double,   std::complex<double>, Packet1cd)
-GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>,  Packet2cf)
+GEMV_MULT_REAL_COMPLEX(float, std::complex<float>, Packet2cf)
+GEMV_MULT_REAL_COMPLEX(double, std::complex<double>, Packet1cd)
+GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
 GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex<double>, Packet1cd)
 
-#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \
-{ \
-    gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2)                                                \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) {                    \
+    gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,      \
+                           ConjugateRhs, StorageOrder>(a0, b, c0);                                                  \
+  }
 
-GEMV_MULT_COMPLEX_REAL(Packet2cf,             float, Packet2cf, std::complex<float>)
-GEMV_MULT_COMPLEX_REAL(Packet1cd,            double, Packet1cd, std::complex<double>)
-GEMV_MULT_COMPLEX_REAL(std::complex<float>,   float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex<double>)
+GEMV_MULT_COMPLEX_REAL(std::complex<float>, float, Packet2cf, std::complex<float>)
 GEMV_MULT_COMPLEX_REAL(std::complex<double>, double, Packet1cd, std::complex<double>)
 
 #ifdef USE_GEMV_MMA
 /** \internal convert packet to real form */
-template<typename T>
-EIGEN_ALWAYS_INLINE T convertReal(T a)
-{
-    return a;
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertReal(T a) {
+  return a;
 }
 
-EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a)
-{
-    return a.v;
-}
+EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) { return a.v; }
 
-EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a)
-{
-    return a.v;
-}
+EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) { return a.v; }
 
 /** \internal convert packet to complex form */
-template<typename T>
-EIGEN_ALWAYS_INLINE T convertComplex(T a)
-{
-    return a;
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertComplex(T a) {
+  return a;
 }
 
-EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a)
-{
-    return Packet2cf(a);
-}
+EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) { return Packet2cf(a); }
 
-EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a)
-{
-    return Packet1cd(a);
-}
+EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) { return Packet1cd(a); }
 
 /** \internal load a vector from a complex location (for MMA version) */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
-EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a)
-{
-    a = SLhsPacket(pload_complex<ResPacket>(&a));
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) {
+  a = SLhsPacket(pload_complex<ResPacket>(&a));
 }
 
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
-EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&)
-{
-    // Pass thru
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) {
+  // Pass through
 }
 
 /** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b)
-{
-    if (NegativeAccumulate)
-    {
-        __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
-    }
-    else {
-        __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
-    }
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
 }
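+// xvf32gerpp accumulates the rank-1 outer product of a and b into the 4x4
+// __vector_quad accumulator; xvf32gernp accumulates its negation
+// (acc = acc - a x b).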
 
 /** \internal perform a matrix multiply and accumulate (positive and negative) of vector_pair a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b)
-{
-    if (NegativeAccumulate)
-    {
-        __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
-    }
-    else {
-        __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
-    }
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
 }
 
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&)
-{
-    // Just for compilation
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) {
+  // Just for compilation
 }
 
 /** \internal madd for complex times complex (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
-{
-    if (ConjugateLhs && ConjugateRhs) {
-        RealPacket b2 = pconj2(convertComplex(b)).v;
-        return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
-    }
-    else if (Negate && !ConjugateLhs && ConjugateRhs) {
-        return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
-    }
-    else {
-        return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
-    }
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
+  } else {
+    return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
+  }
 }
 
-template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
-{
-    if (ConjugateLhs && ConjugateRhs) {
-        RealPacket b2 = pconj2(convertComplex(b)).v;
-        return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
-    }
-    else if (Negate && !ConjugateLhs && ConjugateRhs) {
-        return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
-    }
-    else {
-        return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
-    }
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
+  } else {
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+  }
 }
 
 /** \internal madd for complex times real (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
-EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
-{
-    RealPacket a2 = convertReal(a);
-    if (Conjugate) {
-        RealPacket b2 = pconj2(convertComplex(b)).v;
-        if (StorageOrder == ColMajor) {
-            return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
-        } else {
-            return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
-        }
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+  RealPacket a2 = convertReal(a);
+  if (Conjugate) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    if (StorageOrder == ColMajor) {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
+    } else {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
     }
-    else {
-        if (StorageOrder == ColMajor) {
-            return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
-        } else {
-            return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
-        }
+  } else {
+    if (StorageOrder == ColMajor) {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
+    } else {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
     }
+  }
 }
 
 /** \internal madd for real times complex (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
-EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
-{
-    if (Conjugate) {
-        RealPacket b2 = pconj2(convertComplex(b)).v;
-        return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
-    }
-    else {
-        return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
-    }
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+  if (Conjugate) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+  } else {
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+  }
 }
 
 /** \internal core multiply operation for vectors (MMA version) - complex times complex */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
-    ScalarPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pload_realimag_combine(b);
-    } else {
-        b0 = pload_realimag_combine_row(b);
-    }
-    pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_realimag_combine(b);
+  } else {
+    b0 = pload_realimag_combine_row(b);
+  }
+  pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
 }
 
 /** \internal core multiply operation for vectors (MMA version) - complex times real */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
-    pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
-    ScalarPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pload_real(b);
-    }
-    else {
-        b0 = pload_real_row<ResPacket>(b);
-    }
-    pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_real(b);
+  } else {
+    b0 = pload_real_row<ResPacket>(b);
+  }
+  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
 }
 
 /** \internal core multiply operation for vectors (MMA version) - real times complex */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
-    ScalarPacket b0;
-    if (StorageOrder == ColMajor) {
-        b0 = pload_complex_full(b);
-    }
-    else {
-        b0 = pload_complex_full_row(b);
-    }
-    pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs, (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_complex_full(b);
+  } else {
+    b0 = pload_complex_full_row(b);
+  }
+  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs,
+                         (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
 }
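// Why the load differs by storage order - a sketch assuming, from the helper
// names alone, that the column-major load reuses one rhs scalar across all
// lhs lanes while the row-major variant reads consecutive rhs entries:
#include <array>
#include <complex>
using PacketModel = std::array<std::complex<double>, 2>;
PacketModel load_colmajor_model(const std::complex<double>* b) {
  return {b[0], b[0]};  // one column scalar broadcast to every lane
}
PacketModel load_rowmajor_model(const std::complex<double>* b) {
  return {b[0], b[1]};  // consecutive rhs entries, one per lane
}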
 
-#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
-    gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType)                                                             \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>         \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                      \
+    gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs,          \
+                                  ConjugateRhs, StorageOrder>(a0, b, c0);                                           \
+  }
 
-GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf,     std::complex<float>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
 GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex<float>)
-GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd,     std::complex<double>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
 
 /** \internal core multiply operation for vectors (MMA version) - complex times complex, or real times complex when the lhs scalar is real (see the sizeof(LhsScalar) dispatch below) */
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0)
-{
-    if (sizeof(LhsScalar) == 16) {
-        gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
-    }
-    else {
-        gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
-    }
+template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,
+          typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0) {
+  if (sizeof(LhsScalar) == 16) {
+    gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+                                  StorageOrder>(a0, b, c0);
+  } else {
+    gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+                               StorageOrder>(a0, b, c0);
+  }
 }
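// The sizeof(LhsScalar) == 16 test is how the overload above separates a
// complex<double> lhs from a real double lhs at compile time; on the Power
// targets this file is built for, the sizes work out as checked here:
#include <complex>
static_assert(sizeof(std::complex<double>) == 16, "complex lhs -> complex-complex path");
static_assert(sizeof(double) == 8, "real lhs -> real-complex path");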
 
-#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
-    gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType)                                                                  \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,   \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>           \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                        \
+    gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+                               StorageOrder>(a0, b, c0);                                                              \
+  }
 
 GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex<float>)
 GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex<double>)
 
-#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
-    gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType)                                                                  \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,   \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>           \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                        \
+    gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+                               StorageOrder>(a0, b, c0);                                                              \
+  }
 
-GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf,     float)
-GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd,     double)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
 GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float)
 GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double)
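// For reference, one instantiation of the macro above, e.g.
// GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float), expands (whitespace aside) to
// the following overload; it needs the surrounding Eigen context to compile:
template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,
          typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(Packet2cf& a0, float* b, __vector_quad* c0) {
  gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
                             StorageOrder>(a0, b, c0);
}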
 
 /** \internal disassemble MMA accumulator results into packets */
-template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
-    __builtin_mma_disassemble_acc(&result0.packet, c0);
-    if (sizeof(LhsPacket) == 16) {
-        if (sizeof(RhsPacket) == 16) {
-            ScalarPacket tmp0, tmp2;
-            tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
-            tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
-            result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
-            result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
-            result0.packet[2] = tmp2;
-            result0.packet[0] = tmp0;
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  __builtin_mma_disassemble_acc(&result0.packet, c0);
+  if (sizeof(LhsPacket) == 16) {
+    if (sizeof(RhsPacket) == 16) {
+      ScalarPacket tmp0, tmp2;
+      tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
+      tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
+      result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
+      result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
+      result0.packet[2] = tmp2;
+      result0.packet[0] = tmp0;
 
-            if (ConjugateLhs) {
-                result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-                result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
-            } else if (ConjugateRhs) {
-                result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
-                result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
-            } else {
-                result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
-                result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
-            }
-            result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
-            result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
-        } else {
-            result0.packet[0][1] = result0.packet[1][1];
-            result0.packet[2][1] = result0.packet[3][1];
-        }
+      if (ConjugateLhs) {
+        result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+        result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
+      } else if (ConjugateRhs) {
+        result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+        result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
+      } else {
+        result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+        result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+      }
+      result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+      result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
+    } else {
+      result0.packet[0][1] = result0.packet[1][1];
+      result0.packet[2][1] = result0.packet[3][1];
     }
+  }
 }
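// The merge/conjugate/add sequence above reassembles complex products from
// two real partial-product pairs. A scalar model of the identity it relies
// on (which accumulator row holds which pair is a packet-layout detail):
#include <complex>
std::complex<double> cplx_mul_model(std::complex<double> a, std::complex<double> b) {
  double p_rr = a.real() * b.real(), p_ii = a.imag() * b.imag();  // first pair
  double p_ri = a.real() * b.imag(), p_ir = a.imag() * b.real();  // second pair
  return {p_rr - p_ii, p_ri + p_ir};  // the sign flip is what pconjinv/pconj2 apply lane-wide
}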
 
-template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
-    __builtin_mma_disassemble_acc(&result0.packet, c0);
-    if (GEMV_IS_COMPLEX_COMPLEX) {
-        if (ConjugateLhs) {
-            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-            result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
-        } else {
-            if (ConjugateRhs) {
-                result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
-            } else {
-                result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
-            }
-        }
-        result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
-    } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
-        if (ConjugateLhs) {
-            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-        }
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  __builtin_mma_disassemble_acc(&result0.packet, c0);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
     } else {
-        result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
+      if (ConjugateRhs) {
+        result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
+      } else {
+        result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
+      }
     }
+    result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+  } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+    }
+  } else {
+    result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
+  }
 }
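// Scalar meanings of the lane helpers used above, assumed from their names
// and from Eigen's generic pconj/pcplxflip: pconj negates the imaginary part,
// pcplxflip swaps real and imaginary parts, and the flipconj/conjflip
// variants are taken here to compose the two in the two possible orders.
#include <complex>
std::complex<float> pconj_model(std::complex<float> a) { return std::conj(a); }
std::complex<float> pcplxflip_model(std::complex<float> a) { return {a.imag(), a.real()}; }
std::complex<float> pcplxflipconj_model(std::complex<float> a) { return pcplxflip_model(pconj_model(a)); }
std::complex<float> pcplxconjflip_model(std::complex<float> a) { return pconj_model(pcplxflip_model(a)); }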
 
-template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
-    if (!GEMV_IS_COMPLEX_FLOAT) {
-        disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
-    } else {
-        disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
-    }
+template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  if (!GEMV_IS_COMPLEX_FLOAT) {
+    disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+  } else {
+    disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+  }
 }
 #endif
 
@@ -1952,194 +1810,207 @@
 #define GEMV_LOADPACKET_COL_COMPLEX(iter) \
   loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
 
-#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \
-  convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
+#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
 
 #ifdef USE_GEMV_MMA
 #define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    __builtin_mma_xxsetaccz(&e0##iter); \
+  if (GEMV_GETN_COMPLEX(N) > iter) {       \
+    __builtin_mma_xxsetaccz(&e0##iter);    \
   }
 
 #if EIGEN_COMP_LLVM
-#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
-  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2)                     \
+  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), \
+                     GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));    \
   EIGEN_UNUSED_VARIABLE(f##iter1);
 #else
-#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
-  if (sizeof(LhsPacket) == 16) { \
-    const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \
-    a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src)); \
-    EIGEN_UNUSED_VARIABLE(f##iter1); \
-  } else { \
-    f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j); \
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2)                                                         \
+  if (sizeof(LhsPacket) == 16) {                                                                            \
+    const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j);                                  \
+    a##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src));                             \
+    EIGEN_UNUSED_VARIABLE(f##iter1);                                                                        \
+  } else {                                                                                                  \
+    f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j);                  \
     GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \
   }
 #endif
 
-#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
-      EIGEN_UNUSED_VARIABLE(a##iter); \
-    } else { \
+#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N)          \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                 \
+    if (GEMV_IS_COMPLEX_FLOAT) {                     \
+      f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter);   \
+      EIGEN_UNUSED_VARIABLE(a##iter);                \
+    } else {                                         \
       GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(a##iter); \
-    EIGEN_UNUSED_VARIABLE(f##iter); \
+    }                                                \
+  } else {                                           \
+    EIGEN_UNUSED_VARIABLE(a##iter);                  \
+    EIGEN_UNUSED_VARIABLE(f##iter);                  \
   }
 
-#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter); \
-    } else { \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter); \
-    } \
+#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N)                                                                      \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter);                        \
+    } else {                                                                                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter);                        \
+    }                                                                                                            \
   }
 
 #define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
   GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
 
 #define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter1) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \
-      EIGEN_UNUSED_VARIABLE(a##iter3) \
-    } else { \
-      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \
-      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(a##iter2); \
-    EIGEN_UNUSED_VARIABLE(a##iter3); \
-  } \
-  EIGEN_UNUSED_VARIABLE(f##iter2); \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                      \
+    if (GEMV_IS_COMPLEX_FLOAT) {                           \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2);        \
+      EIGEN_UNUSED_VARIABLE(a##iter3)                      \
+    } else {                                               \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1);   \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1);   \
+    }                                                      \
+  } else {                                                 \
+    EIGEN_UNUSED_VARIABLE(a##iter2);                       \
+    EIGEN_UNUSED_VARIABLE(a##iter3);                       \
+  }                                                        \
+  EIGEN_UNUSED_VARIABLE(f##iter2);                         \
   EIGEN_UNUSED_VARIABLE(f##iter3);
 
-#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter1) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      PLhsPacket g[2]; \
-      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2); \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2); \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3); \
-    } else { \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2); \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3); \
-    } \
+#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N)                                                       \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                                                                            \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      PLhsPacket g[2];                                                                                           \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2);                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2);                          \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3);                          \
+    } else {                                                                                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2);                      \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3);                      \
+    }                                                                                                            \
   }
 
 #if EIGEN_COMP_LLVM
-#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
-  if (GEMV_GETN_COMPLEX(N) > 1) { \
+#define GEMV_LOAD_COL_COMPLEX_MMA(N)                       \
+  if (GEMV_GETN_COMPLEX(N) > 1) {                          \
     GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \
-  } else { \
-    GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \
+  } else {                                                 \
+    GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)             \
   }
 
-#define GEMV_WORK_COL_COMPLEX_MMA(N) \
-  if (GEMV_GETN_COMPLEX(N) > 1) { \
+#define GEMV_WORK_COL_COMPLEX_MMA(N)                       \
+  if (GEMV_GETN_COMPLEX(N) > 1) {                          \
     GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \
-  } else { \
-    GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \
+  } else {                                                 \
+    GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)             \
   }
 #else
-#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
-  GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
+#define GEMV_LOAD_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
 
-#define GEMV_WORK_COL_COMPLEX_MMA(N) \
-  GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
+#define GEMV_WORK_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
 #endif
 
-#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
-  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter);
+#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter)                                                                   \
+  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>( \
+      &e0##iter, result0##iter);
 
-#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \
-    c0##iter = PResPacket(result0##iter.packet[0]); \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
-    } else { \
-      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \
-      c0##iter = PResPacket(result0##iter.packet[2]); \
-      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \
-    } \
+#define GEMV_STORE_COL_COMPLEX_MMA(iter, N)                                                     \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                            \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter);                                                         \
+    c0##iter = PResPacket(result0##iter.packet[0]);                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + (iter * ResPacketSize));                              \
+    } else {                                                                                    \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize));                       \
+      c0##iter = PResPacket(result0##iter.packet[2]);                                           \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize));                 \
+    }                                                                                           \
   }
 
-#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter1) { \
-    GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \
-    GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \
-    c0##iter2 = PResPacket(result0##iter2.packet[0]); \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      c0##iter3 = PResPacket(result0##iter3.packet[0]); \
-      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
-    } else { \
-      c0##iter3 = PResPacket(result0##iter2.packet[2]); \
-      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
-      c0##iter2 = PResPacket(result0##iter3.packet[0]); \
-      c0##iter3 = PResPacket(result0##iter3.packet[2]); \
-      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
-    } \
+#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N)                                                        \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                                                                              \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter2);                                                                           \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter3);                                                                           \
+    c0##iter2 = PResPacket(result0##iter2.packet[0]);                                                              \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                   \
+      c0##iter3 = PResPacket(result0##iter3.packet[0]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(      \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+    } else {                                                                                                       \
+      c0##iter3 = PResPacket(result0##iter2.packet[2]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>( \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+      c0##iter2 = PResPacket(result0##iter3.packet[0]);                                                            \
+      c0##iter3 = PResPacket(result0##iter3.packet[2]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>( \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+    }                                                                                                              \
   }
 
-#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
-  GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \
-  Index j = j2; \
-  do { \
-    const RhsScalar& b1 = rhs2(j, 0); \
-    RhsScalar* b = const_cast<RhsScalar *>(&b1); \
-    GEMV_UNROLL(GEMV_PREFETCH, N) \
-    GEMV_LOAD_COL_COMPLEX_MMA(N) \
-    GEMV_WORK_COL_COMPLEX_MMA(N) \
-  } while (++j < jend); \
-  if (GEMV_GETN(N) <= 2) { \
-    GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \
-  } else { \
+#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)                 \
+  GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N)                 \
+  Index j = j2;                                             \
+  do {                                                      \
+    const RhsScalar& b1 = rhs2(j, 0);                       \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1);             \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                           \
+    GEMV_LOAD_COL_COMPLEX_MMA(N)                            \
+    GEMV_WORK_COL_COMPLEX_MMA(N)                            \
+  } while (++j < jend);                                     \
+  if (GEMV_GETN(N) <= 2) {                                  \
+    GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N)              \
+  } else {                                                  \
     GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \
-  } \
+  }                                                         \
   i += (ResPacketSize * N);
 #endif
 
-#define GEMV_INIT_COMPLEX(iter, N) \
-  if (N > iter) { \
-    c0##iter = pset_zero<PResPacket>(); \
+#define GEMV_INIT_COMPLEX(iter, N)                                   \
+  if (N > iter) {                                                    \
+    c0##iter = pset_zero<PResPacket>();                              \
     c1##iter = pset_init<ResPacket, LhsPacket, RhsPacket>(c1##iter); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(c0##iter); \
-    EIGEN_UNUSED_VARIABLE(c1##iter); \
+  } else {                                                           \
+    EIGEN_UNUSED_VARIABLE(c0##iter);                                 \
+    EIGEN_UNUSED_VARIABLE(c1##iter);                                 \
   }
 
-#define GEMV_WORK_COL_COMPLEX(iter, N) \
-  if (N > iter) { \
-    f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
-    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(f##iter); \
+#define GEMV_WORK_COL_COMPLEX(iter, N)                                                                     \
+  if (N > iter) {                                                                                          \
+    f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter);                                                           \
+    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+                      ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter);                             \
+  } else {                                                                                                 \
+    EIGEN_UNUSED_VARIABLE(f##iter);                                                                        \
   }
 
-#define GEMV_STORE_COL_COMPLEX(iter, N) \
-  if (N > iter) { \
-    if (GEMV_IS_COMPLEX_COMPLEX) { \
-      c0##iter = padd(c0##iter, c1##iter); \
-    } \
-    pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
+#define GEMV_STORE_COL_COMPLEX(iter, N)                                                       \
+  if (N > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_COMPLEX) {                                                            \
+      c0##iter = padd(c0##iter, c1##iter);                                                    \
+    }                                                                                         \
+    pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+        c0##iter, alpha_data, res + i + (iter * ResPacketSize));                              \
   }
 
 /** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */
-#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \
-  GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \
-  Index j = j2; \
-  do { \
-    const RhsScalar& b1 = rhs2(j, 0); \
-    RhsScalar* b = const_cast<RhsScalar *>(&b1); \
-    GEMV_UNROLL(GEMV_PREFETCH, N) \
-    GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \
-  } while (++j < jend); \
-  GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \
+#define GEMV_PROCESS_COL_COMPLEX_ONE(N)         \
+  GEMV_UNROLL(GEMV_INIT_COMPLEX, N)             \
+  Index j = j2;                                 \
+  do {                                          \
+    const RhsScalar& b1 = rhs2(j, 0);           \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1); \
+    GEMV_UNROLL(GEMV_PREFETCH, N)               \
+    GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N)       \
+  } while (++j < jend);                         \
+  GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N)        \
   i += (ResPacketSize * N);
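// Stripped of unrolling and packets, one GEMV_PROCESS_COL_COMPLEX_ONE(N) step
// accumulates, for a band of N * ResPacketSize consecutive rows, partial dot
// products against the rhs slice [j2, jend) - a column-major sketch:
template <typename Scalar>
void gemv_col_band_model(const Scalar* lhs, long lhsStride, const Scalar* rhs, Scalar* acc, long band, long j2,
                         long jend) {
  for (long j = j2; j < jend; ++j)   // the do/while over the rhs slice
    for (long r = 0; r < band; ++r)  // the unrolled packet lanes
      acc[r] += lhs[r + j * lhsStride] * rhs[j];
}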
 
 #if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA))
@@ -2147,465 +2018,440 @@
 #endif
 
 #ifdef USE_GEMV_COL_COMPLEX_MMA
-#define GEMV_PROCESS_COL_COMPLEX(N) \
-  GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
 #else
 #if defined(USE_GEMV_MMA) && (__GNUC__ > 10)
-#define GEMV_PROCESS_COL_COMPLEX(N) \
+#define GEMV_PROCESS_COL_COMPLEX(N)          \
   if (sizeof(Scalar) != sizeof(LhsPacket)) { \
-    GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
-  } else { \
-    GEMV_PROCESS_COL_COMPLEX_ONE(N) \
+    GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)      \
+  } else {                                   \
+    GEMV_PROCESS_COL_COMPLEX_ONE(N)          \
   }
 #else
-#define GEMV_PROCESS_COL_COMPLEX(N) \
-  GEMV_PROCESS_COL_COMPLEX_ONE(N)
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE(N)
 #endif
 #endif
 
-template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_complex_col(
-    Index rows, Index cols,
-    const LhsMapper& alhs,
-    const RhsMapper& rhs,
-    ResScalar* res, Index resIncr,
-    ResScalar alpha)
-{
-    typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                          ResScalar* res, Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
 
-    typedef typename Traits::LhsPacket LhsPacket;
-    typedef typename Traits::RhsPacket RhsPacket;
-    typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-    typedef typename packet_traits<Scalar>::type ScalarPacket;
-    typedef typename packet_traits<LhsScalar>::type PLhsPacket;
-    typedef typename packet_traits<ResScalar>::type PResPacket;
-    typedef gemv_traits<ResPacket, ResPacket> PTraits;
+  typedef typename packet_traits<Scalar>::type ScalarPacket;
+  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+  typedef typename packet_traits<ResScalar>::type PResPacket;
+  typedef gemv_traits<ResPacket, ResPacket> PTraits;
 
-    EIGEN_UNUSED_VARIABLE(resIncr);
-    eigen_internal_assert(resIncr == 1);
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
 
-    // The following copy tells the compiler that lhs's attributes are not modified outside this function
-    // This helps GCC to generate proper code.
-    LhsMapper lhs(alhs);
-    RhsMapper rhs2(rhs);
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
 
-    conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
 
-    const Index lhsStride = lhs.stride();
-    // TODO: for padded aligned inputs, we could enable aligned reads
-    enum {
-        LhsAlignment = Unaligned,
-        ResPacketSize = PTraits::ResPacketSize,
-        LhsPacketSize = PTraits::LhsPacketSize,
-        RhsPacketSize = PTraits::RhsPacketSize,
-    };
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = PTraits::ResPacketSize,
+    LhsPacketSize = PTraits::LhsPacketSize,
+    RhsPacketSize = PTraits::RhsPacketSize,
+  };
 #ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-    const Index prefetch_dist = 64 * LhsPacketSize;
+  const Index prefetch_dist = 64 * LhsPacketSize;
 #endif
 
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    const Index n8 = rows - 8 * ResPacketSize + 1;
-    const Index n4 = rows - 4 * ResPacketSize + 1;
-    const Index n2 = rows - 2 * ResPacketSize + 1;
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
 #endif
-    const Index n1 = rows - 1 * ResPacketSize + 1;
+  const Index n1 = rows - 1 * ResPacketSize + 1;
 
-    // TODO: improve the following heuristic:
-    const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
 
-    typedef alpha_store<PResPacket, ResPacket, ResScalar, Scalar> AlphaData;
-    AlphaData alpha_data(alpha);
+  typedef alpha_store<PResPacket, ResPacket, ResScalar, Scalar> AlphaData;
+  AlphaData alpha_data(alpha);
 
-    for (Index j2 = 0; j2 < cols; j2 += block_cols)
-    {
-        Index jend = numext::mini(j2 + block_cols, cols);
-        Index i = 0;
-        PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
-        ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
-        PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+    ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+    PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
 #ifdef USE_GEMV_MMA
-        __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
-        __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
-        PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
-        GEMV_UNUSED(8, e0)
-        GEMV_UNUSED(8, result0)
-        GEMV_UNUSED(8, a)
-        GEMV_UNUSED(8, f)
+    __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+    __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
+    PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
+    GEMV_UNUSED(8, e0)
+    GEMV_UNUSED(8, result0)
+    GEMV_UNUSED(8, a)
+    GEMV_UNUSED(8, f)
 #if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA)
-        if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
+    if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
 #endif
 #endif
 #ifndef GCC_ONE_VECTORPAIR_BUG
-        {
-            while (i < n8)
-            {
-                GEMV_PROCESS_COL_COMPLEX(8)
-            }
-        }
-        while (i < n4)
-        {
-            GEMV_PROCESS_COL_COMPLEX(4)
-        }
-        if (i < n2)
-        {
-            GEMV_PROCESS_COL_COMPLEX(2)
-        }
-        if (i < n1)
-#else
-        while (i < n1)
-#endif
-        {
-            GEMV_PROCESS_COL_COMPLEX_ONE(1)
-        }
-        for (;i < rows;++i)
-        {
-            ResScalar d0(0);
-            Index j = j2;
-            do {
-                d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
-            } while (++j < jend);
-            res[i] += alpha * d0;
-        }
+    {
+      while (i < n8) {
+        GEMV_PROCESS_COL_COMPLEX(8)
+      }
     }
+    while (i < n4) {
+      GEMV_PROCESS_COL_COMPLEX(4)
+    }
+    if (i < n2) {
+      GEMV_PROCESS_COL_COMPLEX(2)
+    }
+    if (i < n1)
+#else
+    while (i < n1)
+#endif
+    {
+      GEMV_PROCESS_COL_COMPLEX_ONE(1)
+    }
+    for (; i < rows; ++i) {
+      ResScalar d0(0);
+      Index j = j2;
+      do {
+        d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+      } while (++j < jend);
+      res[i] += alpha * d0;
+    }
+  }
 }
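// A worked instance of the block_cols heuristic above, with values picked for
// illustration only:
long block_cols_model(long cols, long lhsStride, long scalarSize) {
  return cols < 128 ? cols : (lhsStride * scalarSize < 16000 ? 16 : 8);
}
// block_cols_model(1000, 1000, 8 /* sizeof(std::complex<float>) */) == 16  (8000 < 16000)
// block_cols_model(1000, 4000, 8 /* sizeof(double) */) == 8                (32000 >= 16000)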
 
-template <typename Scalar, int N> struct ScalarBlock {
-    Scalar scalar[N];
+template <typename Scalar, int N>
+struct ScalarBlock {
+  Scalar scalar[N];
 };
 
 #ifdef USE_GEMV_MMA
-static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
+static Packet16uc p16uc_ELEMENT_3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+                                     0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
 
 /** \internal predux (add elements of a vector) from a MMA accumulator - real results */
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1)
-{
-    PacketBlock<ResPacket, 4> result0, result1;
-    __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    __builtin_mma_disassemble_acc(&result1.packet, acc1);
-    result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
-    result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
-    result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
-    result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
-    result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
-    return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<ResPacket, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
+  result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
+  result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
+  result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
+  result0.packet[0] =
+      vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
+  return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
 }
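// predux is Eigen's horizontal add; the MMA variant above performs the same
// reduction across the four accumulator rows with vector merges instead of a
// scalar loop. Minimal model for a 4-lane packet:
#include <array>
template <typename Scalar>
Scalar predux_model(const std::array<Scalar, 4>& p) {
  return p[0] + p[1] + p[2] + p[3];
}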
 
-template<>
-EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1)
-{
-    PacketBlock<Packet2d, 4> result0, result1;
-    __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    __builtin_mma_disassemble_acc(&result1.packet, acc1);
-    result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
-    return *reinterpret_cast<ScalarBlock<double, 2> *>(&result0.packet[0]);
+template <>
+EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<Packet2d, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  result0.packet[0] =
+      vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
+  return *reinterpret_cast<ScalarBlock<double, 2>*>(&result0.packet[0]);
 }
 
 /** \internal add complex results together */
-template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0, PacketBlock<Packet4f, 4>& result1)
-{
-    ScalarBlock<std::complex<float>, 2> cc0;
-    result0.packet[0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
-    result0.packet[2] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
-    result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
-    if (GEMV_IS_COMPLEX_COMPLEX) {
-        result0.packet[1] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
-        result0.packet[3] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
-        result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
-        if (ConjugateLhs) {
-            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-            result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
-        } else if (ConjugateRhs) {
-            result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
-        } else {
-            result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
-        }
-        result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0,
+                                                                          PacketBlock<Packet4f, 4>& result1) {
+  ScalarBlock<std::complex<float>, 2> cc0;
+  result0.packet[0] = reinterpret_cast<Packet4f>(
+      vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
+  result0.packet[2] = reinterpret_cast<Packet4f>(
+      vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
+  result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    result0.packet[1] = reinterpret_cast<Packet4f>(
+        vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
+    result0.packet[3] = reinterpret_cast<Packet4f>(
+        vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
+    result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
+    } else if (ConjugateRhs) {
+      result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
     } else {
-        if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
-            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-        }
+      result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
     }
-    cc0.scalar[0].real(result0.packet[0][0]);
-    cc0.scalar[0].imag(result0.packet[0][1]);
-    cc0.scalar[1].real(result0.packet[0][2]);
-    cc0.scalar[1].imag(result0.packet[0][3]);
-    return cc0;
+    result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+  } else {
+    if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+    }
+  }
+  cc0.scalar[0].real(result0.packet[0][0]);
+  cc0.scalar[0].imag(result0.packet[0][1]);
+  cc0.scalar[1].real(result0.packet[0][2]);
+  cc0.scalar[1].imag(result0.packet[0][3]);
+  return cc0;
 }
 
-template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&, PacketBlock<Packet2d, 4>&)
-{
-    ScalarBlock<std::complex<double>, 2> cc0;
-    EIGEN_UNUSED_VARIABLE(cc0);
-    return cc0;  // Just for compilation
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&,
+                                                                           PacketBlock<Packet2d, 4>&) {
+  ScalarBlock<std::complex<double>, 2> cc0;
+  EIGEN_UNUSED_VARIABLE(cc0);
+  return cc0;  // Just for compilation
 }
 
 /** \internal predux (add elements of a vector) from a MMA accumulator - complex results */
-template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1)
-{
-    PacketBlock<ResPacket, 4> result0, result1;
-    __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    __builtin_mma_disassemble_acc(&result1.packet, acc1);
-    return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<ResPacket, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
 }
 
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0)
-{
-    PacketBlock<ResPacket, 4> result0;
-    __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
-    return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0) {
+  PacketBlock<ResPacket, 4> result0;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  result0.packet[0] =
+      vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
+  return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
 }
 
-template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0)
-{
-    ScalarBlock<ResScalar, 2> cc0;
-    PacketBlock<ResPacket, 4> result0;
-    __builtin_mma_disassemble_acc(&result0.packet, acc0);
-    if (GEMV_IS_COMPLEX_COMPLEX) {
-        if (ConjugateLhs) {
-            result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
-            result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
-        } else if (ConjugateRhs) {
-            result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
-            result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
-        } else {
-            result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
-            result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
-        }
-        result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
-        result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0) {
+  ScalarBlock<ResScalar, 2> cc0;
+  PacketBlock<ResPacket, 4> result0;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    if (ConjugateLhs) {
+      result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+      result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+    } else if (ConjugateRhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
     } else {
-        result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
-        result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
+      result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+      result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
     }
-    cc0.scalar[0].real(result0.packet[0][0]);
-    cc0.scalar[0].imag(result0.packet[0][1]);
-    cc0.scalar[1].real(result0.packet[2][0]);
-    cc0.scalar[1].imag(result0.packet[2][1]);
-    return cc0;
+    result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
+    result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
+  } else {
+    result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
+    result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
+  }
+  cc0.scalar[0].real(result0.packet[0][0]);
+  cc0.scalar[0].imag(result0.packet[0][1]);
+  cc0.scalar[1].real(result0.packet[2][0]);
+  cc0.scalar[1].imag(result0.packet[2][1]);
+  return cc0;
 }
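// The final real()/imag() stores above pack two complex results out of the
// packet lanes; a model, assuming lanes 0 and 1 of each packet hold one
// value's real and imaginary parts:
#include <array>
#include <complex>
std::array<std::complex<double>, 2> pack_results_model(const std::array<double, 2>& p0,
                                                       const std::array<double, 2>& p2) {
  return {std::complex<double>(p0[0], p0[1]), std::complex<double>(p2[0], p2[1])};
}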
 #endif
 
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b)
-{
-    ScalarBlock<ResScalar, 2> cc0;
-    cc0.scalar[0] = predux(a);
-    cc0.scalar[1] = predux(b);
-    return cc0;
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b) {
+  ScalarBlock<ResScalar, 2> cc0;
+  cc0.scalar[0] = predux(a);
+  cc0.scalar[1] = predux(b);
+  return cc0;
 }
 
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b)
-{
-    return predux_real<ResScalar, ResPacket>(a, b);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b) {
+  return predux_real<ResScalar, ResPacket>(a, b);
 }
 
-#define GEMV_UNROLL_ROW(func, N) \
-  func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
+#define GEMV_UNROLL_ROW(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
 
-#define GEMV_UNROLL_ROW_HALF(func, N) \
-  func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+#define GEMV_UNROLL_ROW_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
 
-#define GEMV_LOADPACKET_ROW(iter) \
-  lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
+#define GEMV_LOADPACKET_ROW(iter) lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
 
 #ifdef USE_GEMV_MMA
-#define GEMV_UNROLL3_ROW(func, N, which) \
-  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
-  func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
+#define GEMV_UNROLL3_ROW(func, N, which)                                                                      \
+  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+      func(6, N, which) func(7, N, which)
 
-#define GEMV_UNUSED_ROW(N, which) \
-  GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
+#define GEMV_UNUSED_ROW(N, which) GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
 
-#define GEMV_INIT_ROW(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
+#define GEMV_INIT_ROW(iter, N)         \
+  if (GEMV_GETN(N) > iter) {           \
     __builtin_mma_xxsetaccz(&c##iter); \
   }
 
 #define GEMV_LOADPAIR_ROW(iter1, iter2) \
   GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1));
 
-#define GEMV_WORK_ROW(iter, N) \
-  if (GEMV_GETN(N) > iter) { \
-    if (GEMV_IS_FLOAT) { \
+#define GEMV_WORK_ROW(iter, N)                                                              \
+  if (GEMV_GETN(N) > iter) {                                                                \
+    if (GEMV_IS_FLOAT) {                                                                    \
       pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \
-    } else { \
-      __vector_pair b##iter; \
-      GEMV_LOADPAIR_ROW(iter, iter << 1) \
-      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0); \
-    } \
+    } else {                                                                                \
+      __vector_pair b##iter;                                                                \
+      GEMV_LOADPAIR_ROW(iter, iter << 1)                                                    \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0);                   \
+    }                                                                                       \
   }
 
-#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
-    if (GEMV_IS_FLOAT) { \
+#define GEMV_PREDUX2(iter1, iter2, iter3, N)                               \
+  if (N > iter1) {                                                         \
+    if (GEMV_IS_FLOAT) {                                                   \
       cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter2, &c##iter3); \
-    } else { \
-      cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(cc##iter1); \
+    } else {                                                               \
+      cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1);            \
+    }                                                                      \
+  } else {                                                                 \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                      \
   }
 #else
-#define GEMV_INIT_ROW(iter, N) \
-  if (N > iter) { \
+#define GEMV_INIT_ROW(iter, N)                \
+  if (N > iter) {                             \
     c##iter = pset1<ResPacket>(ResScalar(0)); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(c##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(c##iter);           \
   }
 
-#define GEMV_WORK_ROW(iter, N) \
-  if (N > iter) { \
+#define GEMV_WORK_ROW(iter, N)                                   \
+  if (N > iter) {                                                \
     c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \
   }
 
-#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
+#define GEMV_PREDUX2(iter1, iter2, iter3, N)                           \
+  if (N > iter1) {                                                     \
     cc##iter1 = predux_real<ResScalar, ResPacket>(c##iter2, c##iter3); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(cc##iter1); \
+  } else {                                                             \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                  \
   }
 #endif
 
-#define GEMV_MULT(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
+#define GEMV_MULT(iter1, iter2, iter3, N)                  \
+  if (N > iter1) {                                         \
     cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \
     cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \
   }
 
-#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
+#define GEMV_STORE_ROW(iter1, iter2, iter3, N)                                           \
+  if (N > iter1) {                                                                       \
     storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
     storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
   }
 
 /** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */
-#define GEMV_PROCESS_ROW(N) \
-  for (; i < n##N; i += N) { \
-    GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
-    Index j = 0; \
-    for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
+#define GEMV_PROCESS_ROW(N)                                       \
+  for (; i < n##N; i += N) {                                      \
+    GEMV_UNROLL_ROW(GEMV_INIT_ROW, N)                             \
+    Index j = 0;                                                  \
+    for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {       \
       RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
-      GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
-    } \
-    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
-    for (; j < cols; ++j) { \
-      RhsScalar a0 = rhs2(j); \
-      GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
-    } \
-    GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
+      GEMV_UNROLL_ROW(GEMV_WORK_ROW, N)                           \
+    }                                                             \
+    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1))                  \
+    for (; j < cols; ++j) {                                       \
+      RhsScalar a0 = rhs2(j);                                     \
+      GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1))                   \
+    }                                                             \
+    GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1))                \
   }
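
Hand-expanding GEMV_PROCESS_ROW(2) on the non-MMA path, as used inside gemv_row below, gives roughly the following (the EIGEN_UNUSED_VARIABLE markers emitted for c2..c7 and cc1..cc3 are omitted for brevity):

for (; i < n2; i += 2) {
  c0 = pset1<ResPacket>(ResScalar(0));  // GEMV_INIT_ROW for iters 0 and 1
  c1 = pset1<ResPacket>(ResScalar(0));
  Index j = 0;
  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {
    RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j);
    c0 = pcj.pmadd(lhs.template load<LhsPacket, Unaligned>(i + 0, j), a0, c0);  // GEMV_WORK_ROW
    c1 = pcj.pmadd(lhs.template load<LhsPacket, Unaligned>(i + 1, j), a0, c1);
  }
  cc0 = predux_real<ResScalar, ResPacket>(c0, c1);  // GEMV_PREDUX2: lane sums for both rows
  for (; j < cols; ++j) {                           // scalar tail
    RhsScalar a0 = rhs2(j);
    cc0.scalar[0] += cj.pmul(lhs(i + 0, j), a0);    // GEMV_MULT
    cc0.scalar[1] += cj.pmul(lhs(i + 1, j), a0);
  }
  storeMaddData<ResScalar>(res + ((i + 0) * resIncr), alpha, cc0.scalar[0]);  // GEMV_STORE_ROW
  storeMaddData<ResScalar>(res + ((i + 1) * resIncr), alpha, cc0.scalar[1]);
}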
 
-template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_row(
-    Index rows, Index cols,
-    const LhsMapper& alhs,
-    const RhsMapper& rhs,
-    ResScalar* res, Index resIncr,
-    ResScalar alpha)
-{
-    typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+                                  Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
 
-    typedef typename Traits::LhsPacket LhsPacket;
-    typedef typename Traits::RhsPacket RhsPacket;
-    typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-    // The following copy tells the compiler that lhs's attributes are not modified outside this function
-    // This helps GCC to generate proper code.
-    LhsMapper lhs(alhs);
-    typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
 
-    eigen_internal_assert(rhs.stride() == 1);
-    conj_helper<LhsScalar, RhsScalar, false, false> cj;
-    conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, false, false> cj;
+  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
 
-    // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
-    //       processing 8 rows at once might be counter productive wrt cache.
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
-    const Index n4 = rows - 3;
-    const Index n2 = rows - 1;
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
 #endif
 
-    // TODO: for padded aligned inputs, we could enable aligned reads
-    enum {
-        LhsAlignment = Unaligned,
-        ResPacketSize = Traits::ResPacketSize,
-        LhsPacketSize = Traits::LhsPacketSize,
-        RhsPacketSize = Traits::RhsPacketSize,
-    };
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    RhsPacketSize = Traits::RhsPacketSize,
+  };
 
-    Index i = 0;
+  Index i = 0;
 #ifdef USE_GEMV_MMA
-    __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
-    GEMV_UNUSED_ROW(8, c)
+  __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
+  GEMV_UNUSED_ROW(8, c)
 #else
-    ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+  ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
 #endif
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
-    GEMV_PROCESS_ROW(8)
-    GEMV_PROCESS_ROW(4)
-    GEMV_PROCESS_ROW(2)
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+  GEMV_PROCESS_ROW(8)
+  GEMV_PROCESS_ROW(4)
+  GEMV_PROCESS_ROW(2)
 #endif
-    for (; i < rows; ++i)
-    {
-        ResPacket d0 = pset1<ResPacket>(ResScalar(0));
-        Index j = 0;
-        for (; j + LhsPacketSize <= cols; j += LhsPacketSize)
-        {
-            RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
+  for (; i < rows; ++i) {
+    ResPacket d0 = pset1<ResPacket>(ResScalar(0));
+    Index j = 0;
+    for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {
+      RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
 
-            d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
-        }
-        ResScalar dd0 = predux(d0);
-        for (; j < cols; ++j)
-        {
-            dd0 += cj.pmul(lhs(i, j), rhs2(j));
-        }
-        res[i * resIncr] += alpha * dd0;
+      d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
     }
+    ResScalar dd0 = predux(d0);
+    for (; j < cols; ++j) {
+      dd0 += cj.pmul(lhs(i, j), rhs2(j));
+    }
+    res[i * resIncr] += alpha * dd0;
+  }
 }
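
Ignoring the 8/4/2-row blocking and the packet loop, all of gemv_row computes the same result as its scalar tail; a reference sketch of the semantics:

// Reference semantics: res[i * resIncr] += alpha * <row i of lhs, rhs>.
for (Index i = 0; i < rows; ++i) {
  ResScalar dd0 = ResScalar(0);
  for (Index j = 0; j < cols; ++j) dd0 += cj.pmul(lhs(i, j), rhs2(j));
  res[i * resIncr] += alpha * dd0;
}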
 
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
-{ \
-    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
-\
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        ResScalar* res, Index resIncr, \
-        ResScalar alpha) { \
-        gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar)                                                                   \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper,            \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar;                                       \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha);            \
+    }                                                                                                                  \
+  };
 
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
-{ \
-    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
-\
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        ResScalar* res, Index resIncr, \
-        ResScalar alpha) { \
-        gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar)                                                                   \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper,            \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar;                                       \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha);            \
+    }                                                                                                                  \
+  };
 
 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float)
 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double)
@@ -2613,378 +2459,360 @@
 EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double)
 
 #ifdef USE_GEMV_MMA
-#define gemv_bf16_col  gemvMMA_bfloat16_col
-#define gemv_bf16_row  gemvMMA_bfloat16_row
+#define gemv_bf16_col gemvMMA_bfloat16_col
+#define gemv_bf16_row gemvMMA_bfloat16_row
 #else
-#define gemv_bf16_col  gemv_bfloat16_col
-#define gemv_bf16_row  gemv_bfloat16_row
+#define gemv_bf16_col gemv_bfloat16_col
+#define gemv_bf16_row gemv_bfloat16_row
 #endif
 
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16() \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
-{ \
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        bfloat16* res, Index resIncr, \
-        bfloat16 alpha) { \
-        gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()                                                                \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper,        \
+                                       ConjugateRhs, Version> {                                                        \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, bfloat16* res, Index resIncr,            \
+                                                        bfloat16 alpha) {                                              \
+      gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha);                                  \
+    }                                                                                                                  \
+  };
 
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16() \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
-{ \
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        bfloat16* res, Index resIncr, \
-        bfloat16 alpha) { \
-        gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()                                                                \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper,        \
+                                       ConjugateRhs, Version> {                                                        \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, bfloat16* res, Index resIncr,            \
+                                                        bfloat16 alpha) {                                              \
+      gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha);                                  \
+    }                                                                                                                  \
+  };
 
 EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()
 EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()
 
-template<typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1)
-{
-    if (GEMV_IS_COMPLEX_COMPLEX) {
-        a0 = padd(a0, a1);
-        b0 = padd(b0, b1);
-    }
-    return predux_complex<ResScalar, PResPacket>(a0, b0);
+template <typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1,
+                                                             ResPacket& b1) {
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    a0 = padd(a0, a1);
+    b0 = padd(b0, b1);
+  }
+  return predux_complex<ResScalar, PResPacket>(a0, b0);
 }
 
-#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \
-  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
+#define GEMV_LOADPACKET_ROW_COMPLEX(iter) loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
 
-#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \
-  convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
+#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
 
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
-  j = 0; \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N)    \
+  j = 0;                                                  \
   for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
-    const RhsScalar& b1 = rhs2(j); \
-    RhsScalar* b = const_cast<RhsScalar *>(&b1); \
-    GEMV_UNROLL_ROW(which, N) \
+    const RhsScalar& b1 = rhs2(j);                        \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1);           \
+    GEMV_UNROLL_ROW(which, N)                             \
   }
 
-#define GEMV_PROCESS_END_ROW_COMPLEX(N) \
-  for (; j < cols; ++j) { \
-    RhsScalar b0 = rhs2(j); \
+#define GEMV_PROCESS_END_ROW_COMPLEX(N)               \
+  for (; j < cols; ++j) {                             \
+    RhsScalar b0 = rhs2(j);                           \
     GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
-  } \
+  }                                                   \
   GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
 
 #ifdef USE_GEMV_MMA
 #define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    __builtin_mma_xxsetaccz(&e0##iter); \
+  if (GEMV_GETN_COMPLEX(N) > iter) {       \
+    __builtin_mma_xxsetaccz(&e0##iter);    \
   }
 
 #define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \
   GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1));
 
-#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \
-  if (GEMV_GETN_COMPLEX(N) > iter) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
-    } else { \
-      __vector_pair a##iter; \
-      GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \
-      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
-    } \
+#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N)                                                                       \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter);                                                    \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter);                        \
+    } else {                                                                                                     \
+      __vector_pair a##iter;                                                                                     \
+      GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1)                                                             \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter);                        \
+    }                                                                                                            \
   }
 
-#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
-    if (GEMV_IS_COMPLEX_FLOAT) { \
-      cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter2, &e0##iter3); \
-    } else { \
-      cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
-    } \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(cc##iter1); \
+#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N)                                                         \
+  if (N > iter1) {                                                                                               \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(     \
+          &e0##iter2, &e0##iter3);                                                                               \
+    } else {                                                                                                     \
+      cc##iter1 =                                                                                                \
+          predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
+    }                                                                                                            \
+  } else {                                                                                                       \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                                                            \
   }
 
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N)  \
   GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \
   GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N)
 
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \
-  for (; i < n##N; i += N) { \
-    GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)                  \
+  for (; i < n##N; i += N) {                                 \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N)                   \
     GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \
-    GEMV_PROCESS_END_ROW_COMPLEX(N); \
+    GEMV_PROCESS_END_ROW_COMPLEX(N);                         \
   }
 #endif
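
The MMA macros above follow the usual accumulator lifecycle; a rough sketch of the shape (the gemv_mult_complex_MMA call stands in for the per-packet work, and this is not the exact macro expansion):

__vector_quad e00;
__builtin_mma_xxsetaccz(&e00);  // GEMV_INIT_ROW_COMPLEX_MMA: zero the accumulator
for (Index j = 0; j + LhsPacketSize <= cols; j += LhsPacketSize) {
  // GEMV_WORK_ROW_COMPLEX_MMA: multiply a packet of lhs row data into e00,
  // e.g. gemv_mult_complex_MMA<...>(a0, b, &e00);
}
// GEMV_PREDUX4_COMPLEX_MMA then calls predux_complex(...), which uses
// __builtin_mma_disassemble_acc to extract four vectors and combine them.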
 
-#define GEMV_WORK_ROW_COMPLEX(iter, N) \
-  if (N > iter) { \
-    PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
-    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter); \
+#define GEMV_WORK_ROW_COMPLEX(iter, N)                                                                     \
+  if (N > iter) {                                                                                          \
+    PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter);                                                \
+    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+                      ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter);                             \
   }
 
-#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
-    cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(cc##iter1); \
+#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N)                                                          \
+  if (N > iter1) {                                                                                            \
+    cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3,  \
+                                                                                       c1##iter2, c1##iter3); \
+  } else {                                                                                                    \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                                                         \
   }
 
-#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
+#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N)          \
+  if (N > iter1) {                                         \
     cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \
     cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \
   }
 
-#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
+#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N)                                   \
+  if (N > iter1) {                                                                       \
     storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
     storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
   }
 
 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
-  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \
+  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N)        \
   GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N)
 
-/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store results */
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
-  for (; i < n##N; i += N) { \
-    GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
+/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store
+ * results */
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)              \
+  for (; i < n##N; i += N) {                             \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)               \
     GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \
-    GEMV_PROCESS_END_ROW_COMPLEX(N); \
+    GEMV_PROCESS_END_ROW_COMPLEX(N);                     \
   }
 
 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
-  if (GEMV_IS_COMPLEX_COMPLEX) { \
-    c0##iter = padd(c0##iter, c1##iter); \
-  } \
+  if (GEMV_IS_COMPLEX_COMPLEX) {                  \
+    c0##iter = padd(c0##iter, c1##iter);          \
+  }                                               \
   dd0 = predux(c0##iter);
 
 #if EIGEN_COMP_LLVM
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
-  GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
 
-#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
-  GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
 
-#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
-  GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
 #else
 // gcc seems to be reading and writing registers unnecessarily to memory.
 // Use the old way for complex double until it is fixed.
 
-#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \
-  lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
+#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
 
 #define GEMV_INIT_COMPLEX_OLD(iter, N) \
-  EIGEN_UNUSED_VARIABLE(c0##iter); \
-  if (N > iter) { \
+  EIGEN_UNUSED_VARIABLE(c0##iter);     \
+  if (N > iter) {                      \
     c1##iter = pset_zero<ResPacket>(); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(c1##iter); \
+  } else {                             \
+    EIGEN_UNUSED_VARIABLE(c1##iter);   \
   }
 
-#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
-  if (N > iter) { \
+#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N)                     \
+  if (N > iter) {                                              \
     LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \
-    c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \
+    c1##iter = pcj.pmadd(a##iter, b0, c1##iter);               \
   }
 
 #define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \
-  if (N > iter1) { \
-    cc##iter1.scalar[0] = predux(c1##iter2); \
-    cc##iter1.scalar[1] = predux(c1##iter3); \
-  } else { \
-    EIGEN_UNUSED_VARIABLE(cc##iter1); \
+  if (N > iter1) {                                       \
+    cc##iter1.scalar[0] = predux(c1##iter2);             \
+    cc##iter1.scalar[1] = predux(c1##iter3);             \
+  } else {                                               \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                    \
   }
 
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
-  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
-  j = 0; \
-  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N)                  \
+  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N)                     \
+  j = 0;                                                        \
+  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {       \
     RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
-    GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
+    GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N)               \
   }
 
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
-  for (; i < n##N; i += N) { \
-    GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N)                  \
+  for (; i < n##N; i += N) {                                 \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N)                   \
     GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \
-    GEMV_PROCESS_END_ROW_COMPLEX(N) \
+    GEMV_PROCESS_END_ROW_COMPLEX(N)                          \
   }
 
-#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
-  dd0 = predux(c1##iter);
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) dd0 = predux(c1##iter);
 
 #if (__GNUC__ > 10)
-#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW  1
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
 #else
-#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW  \
-  (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
 #endif
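
In other words, on gcc 10 and older the predicate selects the new path for float-based kernels and for complex * complex, leaving the old path to the mixed complex<double>/double kernels. Note the macro body is not fully parenthesized, which is harmless here because it is only ever used as the condition of an if. A sketch of the predicate:

// gcc <= 10: new path for float-sized scalars or complex * complex,
// old path otherwise (e.g. complex<double> times double).
bool use_new_path = (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX;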
 
 #define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
-  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
+  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) {   \
     GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
-  } else { \
+  } else {                                 \
     GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
   }
 
-#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N)  \
   if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
-    GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
-  } else { \
-    GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
+    GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)  \
+  } else {                               \
+    GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N)  \
   }
 
 #define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
-  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
+  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) {      \
     GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
-  } else { \
+  } else {                                    \
     GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
   }
 #endif
 
 #ifdef USE_GEMV_MMA
-#define GEMV_PROCESS_ROW_COMPLEX(N) \
-  GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
 #else
-#define GEMV_PROCESS_ROW_COMPLEX(N) \
-  GEMV_PROCESS_ROW_COMPLEX_ONE(N)
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE(N)
 #endif
 
-template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_complex_row(
-    Index rows, Index cols,
-    const LhsMapper& alhs,
-    const RhsMapper& rhs,
-    ResScalar* res, Index resIncr,
-    ResScalar alpha)
-{
-    typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                          ResScalar* res, Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
 
-    typedef typename Traits::LhsPacket LhsPacket;
-    typedef typename Traits::RhsPacket RhsPacket;
-    typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-    typedef typename packet_traits<Scalar>::type ScalarPacket;
-    typedef typename packet_traits<LhsScalar>::type PLhsPacket;
-    typedef typename packet_traits<ResScalar>::type PResPacket;
-    typedef gemv_traits<ResPacket, ResPacket> PTraits;
+  typedef typename packet_traits<Scalar>::type ScalarPacket;
+  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+  typedef typename packet_traits<ResScalar>::type PResPacket;
+  typedef gemv_traits<ResPacket, ResPacket> PTraits;
 
-    // The following copy tells the compiler that lhs's attributes are not modified outside this function
-    // This helps GCC to generate proper code.
-    LhsMapper lhs(alhs);
-    typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
 
-    eigen_internal_assert(rhs.stride() == 1);
-    conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
 #if !EIGEN_COMP_LLVM
-    conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
 #endif
 
-    // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
-    //       processing 8 rows at once might be counter productive wrt cache.
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
-    const Index n4 = rows - 3;
-    const Index n2 = rows - 1;
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
 #endif
 
-    // TODO: for padded aligned inputs, we could enable aligned reads
-    enum {
-        LhsAlignment = Unaligned,
-        ResPacketSize = PTraits::ResPacketSize,
-        LhsPacketSize = PTraits::LhsPacketSize,
-        RhsPacketSize = PTraits::RhsPacketSize,
-    };
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = PTraits::ResPacketSize,
+    LhsPacketSize = PTraits::LhsPacketSize,
+    RhsPacketSize = PTraits::RhsPacketSize,
+  };
 
-    Index i = 0, j;
-    PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
-    ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+  Index i = 0, j;
+  PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+  ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
 #ifdef USE_GEMV_MMA
-    __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
-    GEMV_UNUSED_ROW(8, e0)
-    GEMV_UNUSED_EXTRA(1, c0)
-    GEMV_UNUSED_EXTRA(1, c1)
+  __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+  GEMV_UNUSED_ROW(8, e0)
+  GEMV_UNUSED_EXTRA(1, c0)
+  GEMV_UNUSED_EXTRA(1, c1)
 #endif
-    ResScalar dd0;
+  ResScalar dd0;
 #ifndef GCC_ONE_VECTORPAIR_BUG
-    ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
 #ifdef USE_GEMV_MMA
-    if (!GEMV_IS_COMPLEX_COMPLEX)
+  if (!GEMV_IS_COMPLEX_COMPLEX)
 #endif
-    {
-        GEMV_PROCESS_ROW_COMPLEX(8)
-    }
-    GEMV_PROCESS_ROW_COMPLEX(4)
-    GEMV_PROCESS_ROW_COMPLEX(2)
+  {
+    GEMV_PROCESS_ROW_COMPLEX(8)
+  }
+  GEMV_PROCESS_ROW_COMPLEX(4)
+  GEMV_PROCESS_ROW_COMPLEX(2)
 #endif
-    for (; i < rows; ++i)
-    {
-        GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
-        GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
-        for (; j < cols; ++j)
-        {
-            dd0 += cj.pmul(lhs(i, j), rhs2(j));
-        }
-        res[i * resIncr] += alpha * dd0;
+  for (; i < rows; ++i) {
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
+    GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
+    for (; j < cols; ++j) {
+      dd0 += cj.pmul(lhs(i, j), rhs2(j));
     }
+    res[i * resIncr] += alpha * dd0;
+  }
 }
 
-#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
-{ \
-    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
-\
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        ResScalar* res, Index resIncr, \
-        ResScalar alpha) { \
-        gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar)                                          \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,      \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;                                 \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar,     \
+                       RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs,  \
+                                                                                                res, resIncr, alpha);  \
+    }                                                                                                                  \
+  };
 
-#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
-{ \
-    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
-\
-    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
-        Index rows, Index cols, \
-        const LhsMapper& lhs, \
-        const RhsMapper& rhs, \
-        ResScalar* res, Index resIncr, \
-        ResScalar alpha) { \
-        gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
-    } \
-};
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar)                                          \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,      \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;                                 \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar,     \
+                       RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs,  \
+                                                                                                res, resIncr, alpha);  \
+    }                                                                                                                  \
+  };
 
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float,  float,                std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float,  std::complex<float>,  float)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float,  std::complex<float>,  std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double,               std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, double)
 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, std::complex<double>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float,  float,                std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float,  std::complex<float>,  float)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float,  std::complex<float>,  std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double,               std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, double)
 EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, std::complex<double>)
 
-#endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
-
+#endif  // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b945b33..414f05c 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -27,127 +27,132 @@
 
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-typedef __vector float                   Packet4f;
-typedef __vector int                     Packet4i;
-typedef __vector unsigned int            Packet4ui;
-typedef __vector __bool int              Packet4bi;
-typedef __vector short int               Packet8s;
-typedef __vector unsigned short int      Packet8us;
-typedef __vector __bool short            Packet8bi;
-typedef __vector signed char             Packet16c;
-typedef __vector unsigned char           Packet16uc;
-typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
+typedef __vector float Packet4f;
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8s;
+typedef __vector unsigned short int Packet8us;
+typedef __vector __bool short Packet8bi;
+typedef __vector signed char Packet16c;
+typedef __vector unsigned char Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
-#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
 
-#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = vec_splat_s32(X)
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
 
-#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
-  Packet4ui p4ui_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
 
-#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
-  Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
 
-#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
+#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
   Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
 
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
-  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
-#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
-  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
   const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
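
As a usage sketch of these helpers (HALF and TWO are hypothetical names, chosen to avoid clashing with the constants declared below):

EIGEN_DECLARE_CONST_FAST_Packet4f(HALF, 0.5f);
// expands to: Packet4f p4f_HALF = {0.5f, 0.5f, 0.5f, 0.5f};
EIGEN_DECLARE_CONST_Packet4i(TWO, 2);
// expands to: Packet4i p4i_TWO = pset1<Packet4i>(2);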
 
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
 
 // These constants are endian-agnostic
-static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);       //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);       //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);        //{ 1, 1, 1, 1}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);  //{ -16, -16, -16, -16}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);    //{ -1, -1, -1, -1}
 static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
 static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
-static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
-static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1);  //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static Packet4f p4f_MZERO =
+    (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);  //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);  //{ 1.0, 1.0, 1.0, 1.0}
 #endif
 
-static Packet4f  p4f_COUNTDOWN  = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i  p4i_COUNTDOWN  = { 0, 1, 2, 3 };
-static Packet8s  p8s_COUNTDOWN  = { 0, 1, 2, 3, 4, 5, 6, 7 };
-static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
+static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
 
-static Packet16c  p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
-                                    8, 9, 10, 11, 12, 13, 14, 15};
-static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 
-                                    8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 
-static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
 #ifndef _ARCH_PWR9
-static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
+static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
 #endif
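
These masks are byte-index tables for vec_perm: entry k of the mask selects byte k of the result from the 32-byte concatenation of the two source vectors. A minimal sketch using REVERSE32 (reverse_lanes is a hypothetical helper; Eigen's preverse specializations use the same pattern):

// Reverse the four 32-bit lanes of a Packet4f: result bytes 0..3 come from
// source bytes 12..15, and so on, per p16uc_REVERSE32.
EIGEN_STRONG_INLINE Packet4f reverse_lanes(const Packet4f& a) {
  return (Packet4f)vec_perm((Packet16uc)a, (Packet16uc)a, p16uc_REVERSE32);
}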
 
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
 #endif
-static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
-static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
+static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
+static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
 
-static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
+static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
 
-static Packet16uc p16uc_MERGEE16 = { 0,1, 16,17, 4,5, 20,21, 8,9, 24,25, 12,13, 28,29 };
-static Packet16uc p16uc_MERGEO16 = { 2,3, 18,19, 6,7, 22,23, 10,11, 26,27, 14,15, 30,31 };
+static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_MERGEH16 = { 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 };
+static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
 #else
-static Packet16uc p16uc_MERGEL16 = { 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 };
+static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
 #endif
 
 // Handle endianness properly while loading constants
 // Define global static constants:
 #ifdef _BIG_ENDIAN
 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
+                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
-#endif // _BIG_ENDIAN
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
+                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif  // _BIG_ENDIAN
 
-static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI =
+    p16uc_PSET64_HI + p16uc_HALF64_0_16;  //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO =
+    p16uc_PSET64_LO + p16uc_HALF64_0_16;  //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV =
+    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
-  #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #else
-  #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#define EIGEN_PPC_PREFETCH(ADDR) asm("   dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
 #endif
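Both branches reduce to a single cache-touch hint. A hypothetical usage sketch
(the helper name and loop are illustrative, not part of Eigen):

    static inline void warm(const float* data, int n) {
      for (int i = 0; i + 16 <= n; i += 16) {
        EIGEN_PPC_PREFETCH(data + i)  // the macro supplies its own trailing semicolon
      }
    }
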
 
 #if EIGEN_COMP_LLVM
@@ -256,14 +261,14 @@
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAdd   = 1,
-    HasSub   = 1,
+    HasAdd = 1,
+    HasSub = 1,
     HasShift = 1,
-    HasMul   = 1,
-#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
-    HasDiv   = 1,
+    HasMul = 1,
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+    HasDiv = 1,
 #else
-    HasDiv   = 0,
+    HasDiv = 0,
 #endif
     HasBlend = 1,
     HasCmp = 1
@@ -279,10 +284,10 @@
     AlignedOnScalar = 1,
     size = 8,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
     HasBlend = 1,
     HasCmp = 1
   };
@@ -297,10 +302,10 @@
     AlignedOnScalar = 1,
     size = 8,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
     HasBlend = 1,
     HasCmp = 1
   };
@@ -315,10 +320,10 @@
     AlignedOnScalar = 1,
     size = 16,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
     HasBlend = 1,
     HasCmp = 1
   };
@@ -333,88 +338,125 @@
     AlignedOnScalar = 1,
     size = 16,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
     HasBlend = 1,
     HasCmp = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f>
-{
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet4i  integer_packet;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet4i>
-{
-  typedef int       type;
-  typedef Packet4i  half;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet8s>
-{
+template <>
+struct unpacket_traits<Packet8s> {
   typedef short int type;
-  typedef Packet8s  half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet8us>
-{
+template <>
+struct unpacket_traits<Packet8us> {
   typedef unsigned short int type;
-  typedef Packet8us          half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8us half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
-template<> struct unpacket_traits<Packet16c>
-{
+template <>
+struct unpacket_traits<Packet16c> {
   typedef signed char type;
-  typedef Packet16c  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet16c half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet16uc>
-{
+template <>
+struct unpacket_traits<Packet16uc> {
   typedef unsigned char type;
-  typedef Packet16uc  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet16uc half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
-template<> struct unpacket_traits<Packet8bf>
-{
+template <>
+struct unpacket_traits<Packet8bf> {
   typedef bfloat16 type;
-  typedef Packet8bf          half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet8bf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) {
   union {
-    Packet16c   v;
+    Packet16c v;
     signed char n[16];
   } vt;
   vt.v = v;
-  for (int i=0; i< 16; i++)
-    s << vt.n[i] << ", ";
+  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) {
   union {
-    Packet16uc   v;
+    Packet16uc v;
     unsigned char n[16];
   } vt;
   vt.v = v;
-  for (int i=0; i< 16; i++)
-    s << vt.n[i] << ", ";
+  for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
   union {
-    Packet4f   v;
+    Packet4f v;
     float n[4];
   } vt;
   vt.v = v;
@@ -422,10 +464,9 @@
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
   union {
-    Packet4i   v;
+    Packet4i v;
     int n[4];
   } vt;
   vt.v = v;
@@ -433,10 +474,9 @@
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
   union {
-    Packet4ui   v;
+    Packet4ui v;
     unsigned int n[4];
   } vt;
   vt.v = v;
@@ -445,8 +485,7 @@
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
-{
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
  // some versions of GCC emit "unused-but-set-parameter" warnings.
   // ignoring these warnings for now.
   EIGEN_UNUSED_VARIABLE(from);
@@ -459,52 +498,51 @@
 }
 
 // Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
   return pload_common<Packet4f>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
   return pload_common<Packet4i>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
   return pload_common<Packet8s>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
   return pload_common<Packet8us>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
   return pload_common<Packet16c>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
   return pload_common<Packet16uc>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
   return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
 }
 
 template <typename Packet>
-EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
-{
+EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
  // some versions of GCC emit "unused-but-set-parameter" warnings.
   // ignoring these warnings for now.
   EIGEN_UNUSED_VARIABLE(from);
   EIGEN_DEBUG_ALIGNED_LOAD
  // Ignore warnings about partially initialized input memory
 #if !EIGEN_COMP_LLVM
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
 #ifdef EIGEN_VECTORIZE_VSX
   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
@@ -512,18 +550,18 @@
   return vec_ld(0, from);
 #endif
 #if !EIGEN_COMP_LLVM
-  #pragma GCC diagnostic pop
+#pragma GCC diagnostic pop
 #endif
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16*     from)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
   return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
 }
 
 template <typename Packet>
-EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
-{
+EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+                                                const Index offset) {
  // some versions of GCC emit "unused-but-set-parameter" warnings.
   // ignoring these warnings for now.
   const Index packet_size = unpacket_traits<Packet>::size;
@@ -546,13 +584,13 @@
 #else
   if (n) {
     EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
-    unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
-    unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
     Index n2 = n * size;
     if (16 <= n2) {
       pstoreu(load2, ploadu<Packet16uc>(from2));
     } else {
-      memcpy((void *)load2, (void *)from2, n2);
+      memcpy((void*)load2, (void*)from2, n2);
     }
     return pload_ignore<Packet>(load);
   } else {
@@ -561,43 +599,44 @@
 #endif
 }
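
The non-VSX fallback above stages the n valid elements in an aligned scratch
buffer (one 16-byte bulk copy when possible, memcpy otherwise) and then issues
a single full-width load; pstore_partial_common below mirrors the same staging
in reverse. A scalar model of the idea, with a hypothetical element type T and
packet width N (it zero-fills the tail, where the real code leaves it
uninitialized and silences the warning instead):

    #include <cstring>

    template <typename T, int N>
    void load_partial_model(const T* from, T (&packet_out)[N], int n, int offset) {
      alignas(16) T scratch[N] = {};                       // tail stays defined here
      std::memcpy(scratch + offset, from, n * sizeof(T));  // copy only the valid elements
      std::memcpy(packet_out, scratch, N * sizeof(T));     // stands in for pload_ignore
    }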
 
-template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
   return pload_partial_common<Packet4f>(from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
   return pload_partial_common<Packet4i>(from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
   return pload_partial_common<Packet8s>(from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
+                                                       const Index offset) {
   return pload_partial_common<Packet8us>(from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
   return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
   return pload_partial_common<Packet16c>(from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
   return pload_partial_common<Packet16uc>(from, n, offset);
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
  // some versions of GCC emit "unused-but-set-parameter" warnings (float *to).
   // ignoring these warnings for now.
   EIGEN_UNUSED_VARIABLE(to);
@@ -609,43 +648,44 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
   pstore_common<Packet4f>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
   pstore_common<Packet4i>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<short int>(short int*       to, const Packet8s& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
   pstore_common<Packet8s>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int*       to, const Packet8us& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
   pstore_common<Packet8us>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16*       to, const Packet8bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
   pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char*       to, const Packet16c& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
   pstore_common<Packet16c>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char*       to, const Packet16uc& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
   pstore_common<Packet16uc>(to, from);
 }
 
-template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+                                               const Index offset) {
  // some versions of GCC emit "unused-but-set-parameter" warnings (float *to).
   // ignoring these warnings for now.
   const Index packet_size = unpacket_traits<Packet>::size;
@@ -669,110 +709,119 @@
   if (n) {
     EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
     pstore(store, from);
-    unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
-    unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
     Index n2 = n * size;
     if (16 <= n2) {
       pstore(to2, ploadu<Packet16uc>(store2));
     } else {
-      memcpy((void *)to2, (void *)store2, n2);
+      memcpy((void*)to2, (void*)store2, n2);
     }
   }
 #endif
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float*  to, const Packet4f& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
   pstore_partial_common<Packet4f>(to, from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int*  to, const Packet4i& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
   pstore_partial_common<Packet4i>(to, from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int*  to, const Packet8s& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
+                                                   const Index offset) {
   pstore_partial_common<Packet8s>(to, from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int*  to, const Packet8us& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+                                                            const Index n, const Index offset) {
   pstore_partial_common<Packet8us>(to, from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16*      to, const Packet8bf& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+                                                  const Index offset) {
   pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char*  to, const Packet16c& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+                                                     const Index offset) {
   pstore_partial_common<Packet16c>(to, from, n, offset);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char*  to, const Packet16uc& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+                                                       const Index offset) {
   pstore_partial_common<Packet16uc>(to, from, n, offset);
 }
 
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
   Packet v = {from, from, from, from};
   return v;
 }
 
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
   Packet v = {from, from, from, from, from, from, from, from};
   return v;
 }
 
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
   Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
   return v;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
   return pset1_size4<Packet4f>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
   return pset1_size4<Packet4i>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
   return pset1_size8<Packet8s>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
   return pset1_size8<Packet8us>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
   return pset1_size16<Packet16c>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
   return pset1_size16<Packet16uc>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
   return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
 }
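
pset1frombits broadcasts a raw 32-bit pattern into every lane and reinterprets
the result as float, which is how sign masks and similar constants are
materialized. A scalar model of the per-lane reinterpretation:

    #include <cstring>

    float float_from_bits(unsigned int u) {
      float f;
      std::memcpy(&f, &u, sizeof f);  // bit-exact and well-defined, unlike a pointer cast
      return f;
    }
    // float_from_bits(0x80000000u) == -0.0f; broadcasting four copies models Packet4f.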
 
-template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16&    from)   {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
   return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE void
-pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
-                      Packet& a0, Packet& a1, Packet& a2, Packet& a3)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
+                                            Packet& a3) {
   a3 = pload<Packet>(a);
   a0 = vec_splat(a3, 0);
   a1 = vec_splat(a3, 1);
@@ -780,21 +829,18 @@
   a3 = vec_splat(a3, 3);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
   pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
-                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
   pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
-{
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
+                                                            const Index n = unpacket_traits<Packet>::size) {
   EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
   eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
   if (stride == 1) {
@@ -806,85 +852,97 @@
   } else {
     LOAD_STORE_UNROLL_16
     for (Index i = 0; i < n; i++) {
-      a[i] = from[i*stride];
+      a[i] = from[i * stride];
     }
     // Leave rest of the array uninitialized
     return pload_ignore<Packet>(a);
   }
 }
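
pgather_common collects n strided scalars into an aligned buffer and then
loads the packet in one shot; stride 1 short-circuits to a plain (partial)
load. A scalar model, plus a worked case:

    template <typename T, int N>
    void gather_model(const T* from, long stride, T (&out)[N]) {
      for (int i = 0; i < N; ++i) out[i] = from[i * stride];  // strided reads, contiguous writes
    }
    // For a 4x4 row-major float matrix m, gather_model(m, 4, col) collects
    // column 0: m[0], m[4], m[8], m[12].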
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
   return pgather_common<Packet4f>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
   return pgather_common<Packet4i>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
   return pgather_common<Packet8s>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
+                                                                                       Index stride) {
   return pgather_common<Packet8us>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
   return pgather_common<Packet8bf>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
   return pgather_common<Packet16c>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
+                                                                                    Index stride) {
   return pgather_common<Packet16uc>(from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
+                                                                                const Index n) {
   return pgather_common<Packet4f>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
+                                                                              const Index n) {
   return pgather_common<Packet4i>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
+                                                                                    const Index n) {
   return pgather_common<Packet8s>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
+pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
   return pgather_common<Packet8us>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
+                                                                                     const Index n) {
   return pgather_common<Packet8bf>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
+                                                                                        Index stride, const Index n) {
   return pgather_common<Packet16c>(from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
+                                                                                            Index stride,
+                                                                                            const Index n) {
   return pgather_common<Packet16uc>(from, stride, n);
 }
 
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
-{
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
+                                                           Index stride,
+                                                           const Index n = unpacket_traits<Packet>::size) {
   EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
   eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
   if (stride == 1) {
@@ -897,129 +955,203 @@
     pstore<__UNPACK_TYPE__(Packet)>(a, from);
     LOAD_STORE_UNROLL_16
     for (Index i = 0; i < n; i++) {
-      to[i*stride] = a[i];
+      to[i * stride] = a[i];
     }
   }
 }
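
pscatter_common is the exact dual: store the packet to an aligned buffer, then
write the lanes back out at the requested stride. As a scalar model:

    template <typename T, int N>
    void scatter_model(const T (&packet)[N], T* to, long stride) {
      for (int i = 0; i < N; ++i) to[i * stride] = packet[i];  // contiguous reads, strided writes
    }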
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
   pscatter_common<Packet4f>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
   pscatter_common<Packet4i>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
+                                                                         Index stride) {
   pscatter_common<Packet8s>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
+                                                                                   const Packet8us& from,
+                                                                                   Index stride) {
   pscatter_common<Packet8us>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+                                                                         Index stride) {
   pscatter_common<Packet8bf>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
+                                                                            Index stride) {
   pscatter_common<Packet16c>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
+                                                                               const Packet16uc& from, Index stride) {
   pscatter_common<Packet16uc>(to, from, stride);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
+                                                                             Index stride, const Index n) {
   pscatter_common<Packet4f>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
+                                                                           const Index n) {
   pscatter_common<Packet4i>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
+                                                                                 Index stride, const Index n) {
   pscatter_common<Packet8s>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
+                                                                                           const Packet8us& from,
+                                                                                           Index stride,
+                                                                                           const Index n) {
   pscatter_common<Packet8us>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+                                                                                 Index stride, const Index n) {
   pscatter_common<Packet8bf>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
+                                                                                    const Packet16c& from, Index stride,
+                                                                                    const Index n) {
   pscatter_common<Packet16c>(to, from, stride, n);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
+                                                                                       const Packet16uc& from,
+                                                                                       Index stride, const Index n) {
   pscatter_common<Packet16uc>(to, from, stride, n);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
-template<> EIGEN_STRONG_INLINE Packet4i   plset<Packet4i>(const int&       a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN;  }
-template<> EIGEN_STRONG_INLINE Packet8s   plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet8us  plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16c  plset<Packet16c>(const signed char& a)   { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a)   { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return pset1<Packet4f>(a) + p4f_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return pset1<Packet4i>(a) + p4i_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
+  return pset1<Packet8s>(a) + p8s_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
+  return pset1<Packet8us>(a) + p8us_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
+  return pset1<Packet16c>(a) + p16c_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
+  return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
+}
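
For example, plset<Packet4f>(10.0f) adds p4f_COUNTDOWN to the broadcast base,
yielding the lanes {10.0f, 11.0f, 12.0f, 13.0f}.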
 
-template<> EIGEN_STRONG_INLINE Packet4f   padd<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4i   padd<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4ui   padd<Packet4ui>  (const Packet4ui&   a, const Packet4ui&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8s   padd<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8us  padd<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16c  padd<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return a + b;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f   psub<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4i   psub<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet8s   psub<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet8us  psub<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return a - b;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
   return vec_xor(a, p4f_MZERO);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
   return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
   return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
@@ -1027,19 +1159,42 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f   pmul<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return vec_madd(a,b, p4f_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet4i   pmul<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a * b; }
-template<> EIGEN_STRONG_INLINE Packet8s   pmul<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us  pmul<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c  pmul<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_madd(a, b, p4f_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a * b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_mul(a, b);
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
 #ifndef __VSX__  // VSX actually provides a div instruction
   Packet4f t, y_0, y_1;
 
@@ -1047,7 +1202,7 @@
   y_0 = vec_re(b);
 
   // Do one Newton-Raphson iteration to get the needed accuracy
-  t   = vec_nmsub(y_0, b, p4f_ONE);
+  t = vec_nmsub(y_0, b, p4f_ONE);
   y_1 = vec_madd(y_0, t, y_0);
 
   return vec_madd(a, y_1, p4f_MZERO);
@@ -1056,9 +1211,9 @@
 #endif
 }
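
The non-VSX branch refines the coarse vec_re reciprocal estimate with a single
Newton-Raphson step, which squares the relative error: with y0 = (1/b)(1+e),
one step gives y1 = y0 * (2 - y0*b) = (1/b)(1 - e^2). A scalar sketch of the
same three fused operations, with a deliberately perturbed estimate standing
in for vec_re:

    #include <cstdio>

    float nr_div(float a, float b) {
      float y0 = (1.0f / b) * 1.001f;  // stand-in for the coarse vec_re(b) estimate
      float t = 1.0f - y0 * b;         // vec_nmsub(y0, b, p4f_ONE): 1 - y0*b
      float y1 = y0 * t + y0;          // vec_madd(y0, t, y0)
      return a * y1;                   // vec_madd(a, y1, p4f_MZERO)
    }

    int main() { std::printf("%.7f vs %.7f\n", nr_div(1.0f, 3.0f), 1.0f / 3.0f); }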
 
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
   return vec_div(a, b);
 #else
   EIGEN_UNUSED_VARIABLE(a);
@@ -1069,154 +1224,302 @@
 }
 
 // for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
-template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return a * b + c;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return vec_madd(a, b, c);
+}
 
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_msub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_nmadd(a, b, c);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  #ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
   // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet4f ret;
-  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
   return ret;
-  #else
+#else
   return vec_min(a, b);
-  #endif
+#endif
 }
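
The asm path above deliberately trades roughly 10% throughput for
std::min-compatible NaN handling: every comparison against NaN is false, so
the compare-then-select always falls back to the first operand. A scalar model
of the semantics (an illustration, not the xvcmpgesp/xxsel instruction pair):

    float min_model(float a, float b) {
      bool ge = (a >= b);  // false when a < b, and also when either side is NaN
      return ge ? b : a;   // NaN in b yields a; NaN in a propagates, like std::min
    }
    // pmax below mirrors this with the predicate (b > a) and the same select.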
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_min(a, b);
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  #ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
   // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet4f ret;
-  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
   return ret;
-  #else
+#else
   return vec_max(a, b);
-  #endif
+#endif
 }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_max(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmple(a, b));
+}
 // To fix bug with vec_cmplt on older versions
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
-  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
-  return vec_nor(c,c);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
+  return vec_nor(c, c);
 }
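+// Editor's note: vec_cmpge(a, b) is false precisely when a < b or the pair is
+// unordered, so the self-NOR (a bitwise NOT of the mask) yields the
+// "less-than or NaN" predicate in one extra instruction.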
 
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmple(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
+}
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmple(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
+}
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmple(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
+}
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmple(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
+}
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
+}
 #endif
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
+}
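+// Editor's note: the per-type #ifdef EIGEN_VECTORIZE_VSX guards above reflect
+// that vec_cmple for the integer element types is not used on baseline
+// AltiVec, and that float vec_cmplt is gated for the older-version bug noted
+// earlier; the integer pcmp_lt and pcmp_eq specializations are unconditional.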
 
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return pand<Packet8us>(a, b);
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return por<Packet8us>(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   return pxor<Packet8us>(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_andc(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_andc(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
   return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
 }
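+// Editor's note: vec_sel is a pure bit-select (each result bit comes from a
+// where the mask bit is 1, from b where it is 0), so pselect assumes per-lane
+// all-ones/all-zeros masks such as those produced by the pcmp_* helpers.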
 
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
-    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
-    Packet4f res;
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f t = vec_add(
+      reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+  Packet4f res;
 
 #ifdef EIGEN_VECTORIZE_VSX
-    __asm__("xvrspiz %x0, %x1\n\t"
-        : "=&wa" (res)
-        : "wa" (t));
+  __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
 #else
-    __asm__("vrfiz %0, %1\n\t"
-        : "=v" (res)
-        : "v" (t));
+  __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
 #endif
 
-    return res;
+  return res;
 }
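+// Editor's note: this rounds half away from zero without touching the FP
+// rounding mode — p4ui_PREV0DOT5 (defined earlier; presumably the bit pattern
+// of the largest float below 0.5f) is given a's sign and added, then
+// xvrspiz/vrfiz truncate toward zero, so exact halves carry over while values
+// just under 0.5 still truncate to 0.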
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vec_floor(a);
+}
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
-{
-    Packet4f res;
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  Packet4f res;
 
-    __asm__("xvrspic %x0, %x1\n\t"
-        : "=&wa" (res)
-        : "wa" (a));
+  __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
 
-    return res;
+  return res;
 }
 #endif
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
   EIGEN_DEBUG_ALIGNED_LOAD
 #if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
   EIGEN_DEBUG_UNALIGNED_LOAD
@@ -1224,45 +1527,46 @@
 #else
   Packet16uc MSQ, LSQ;
   Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  //TODO: Add static_cast here
-  return (Packet) vec_perm(MSQ, LSQ, mask);           // align the data
+  MSQ = vec_ld(0, (unsigned char*)from);   // most significant quadword
+  LSQ = vec_ld(15, (unsigned char*)from);  // least significant quadword
+  mask = vec_lvsl(0, from);                // create the permute mask
+  // TODO: Add static_cast here
+  return (Packet)vec_perm(MSQ, LSQ, mask);  // align the data
 #endif
 }
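+// Editor's note: the big-endian non-VSX path above is the classic AltiVec
+// unaligned-load idiom: two aligned 16-byte loads bracket the data (offset 15
+// guarantees the second load covers the quadword holding the last byte), and
+// vec_lvsl builds the permute that splices the wanted 16 bytes out of the
+// 32-byte window.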
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
   return ploadu_common<Packet4f>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
   return ploadu_common<Packet4i>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
   return ploadu_common<Packet8s>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
   return ploadu_common<Packet8us>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
   return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
 }
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
   return ploadu_common<Packet16c>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
   return ploadu_common<Packet16uc>(from);
 }
 
-template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+                                                 const Index offset) {
   const Index packet_size = unpacket_traits<Packet>::size;
   eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
@@ -1283,13 +1587,13 @@
 #else
   if (n) {
     EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
-    unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
-    unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
     Index n2 = n * size;
     if (16 <= n2) {
       pstoreu(load2, ploadu<Packet16uc>(from2));
     } else {
-      memcpy((void *)load2, (void *)from2, n2);
+      memcpy((void*)load2, (void*)from2, n2);
     }
     return pload_ignore<Packet>(load);
   } else {
@@ -1298,106 +1602,122 @@
 #endif
 }
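+// Editor's note: the fallback path (partly elided by the hunk above) stages
+// the partial load through an aligned scratch array — copying only n elements
+// from the source, then reloading the whole packet — so nothing is ever read
+// past the n elements the caller actually owns.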
 
-template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet4f>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet4i>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet8s>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
+                                                        const Index offset) {
   return ploadu_partial_common<Packet8us>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet16c>(from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
+                                                          const Index offset) {
   return ploadu_partial_common<Packet16uc>(from, n, offset);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)*   from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
   Packet p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet>(from);
-  else                                  p = ploadu<Packet>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet>(from);
+  else
+    p = ploadu<Packet>(from);
   return vec_mergeh(p, p);
 }
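+// Editor's note: vec_mergeh(p, p) interleaves a vector with itself, turning
+// the low-half elements e0 e1 ... into e0 e0 e1 e1 ..., which is precisely the
+// ploaddup contract of loading size/2 values and repeating each twice.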
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
   return ploaddup_common<Packet4f>(from);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
   return ploaddup_common<Packet4i>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
   Packet8s p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
-  else                                  p = ploadu<Packet8s>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8s>(from);
+  else
+    p = ploadu<Packet8s>(from);
   return vec_mergeh(p, p);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
   Packet8us p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
-  else                                  p = ploadu<Packet8us>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8us>(from);
+  else
+    p = ploadu<Packet8us>(from);
   return vec_mergeh(p, p);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
   Packet8s p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
-  else                                  p = ploadu<Packet8s>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8s>(from);
+  else
+    p = ploadu<Packet8s>(from);
   return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
 }
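+// Editor's note: p16uc_QUADRUPLICATE16_HI (defined earlier in this file) is a
+// byte permute that repeats each of the first two 16-bit lanes four times,
+// giving these 8-lane ploadquad specializations their "load 2 values,
+// replicate 4x" semantics.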
 
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
   Packet8us p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
-  else                                  p = ploadu<Packet8us>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8us>(from);
+  else
+    p = ploadu<Packet8us>(from);
   return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
   return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
   Packet16c p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16c>(from);
-  else                                  p = ploadu<Packet16c>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16c>(from);
+  else
+    p = ploadu<Packet16c>(from);
   return vec_mergeh(p, p);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
   Packet16uc p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16uc>(from);
-  else                                  p = ploadu<Packet16uc>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16uc>(from);
+  else
+    p = ploadu<Packet16uc>(from);
   return vec_mergeh(p, p);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
   EIGEN_DEBUG_UNALIGNED_STORE
 #if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
   vec_xst(from, 0, to);
@@ -1407,48 +1727,49 @@
   Packet16uc MSQ, LSQ, edges;
   Packet16uc edgeAlign, align;
 
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                   // Store the MSQ part second
+  MSQ = vec_ld(0, (unsigned char*)to);             // most significant quadword
+  LSQ = vec_ld(15, (unsigned char*)to);            // least significant quadword
+  edgeAlign = vec_lvsl(0, to);                     // permute map to extract edges
+  edges = vec_perm(LSQ, MSQ, edgeAlign);           // extract the edges
+  align = vec_lvsr(0, to);                         // permute map to misalign data
+  MSQ = vec_perm(edges, (Packet16uc)from, align);  // misalign the data (MSQ)
+  LSQ = vec_perm((Packet16uc)from, edges, align);  // misalign the data (LSQ)
+  vec_st(LSQ, 15, (unsigned char*)to);             // Store the LSQ part first
+  vec_st(MSQ, 0, (unsigned char*)to);              // Store the MSQ part second
 #endif
 }
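+// Editor's note: the pre-VSX unaligned store is a read-modify-write: both
+// aligned quadwords covering the destination are loaded, the bytes outside
+// the 16-byte store window are preserved through the edge permute, the
+// payload is rotated into position with vec_lvsr, and the two quadwords are
+// written back.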
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
   pstoreu_common<Packet4f>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
   pstoreu_common<Packet4i>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*      to, const Packet8s& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
   pstoreu_common<Packet8s>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*      to, const Packet8us& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
   pstoreu_common<Packet8us>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16*      to, const Packet8bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
   pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char*      to, const Packet16c& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
   pstoreu_common<Packet16c>(to, from);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char*      to, const Packet16uc& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
   pstoreu_common<Packet16uc>(to, from);
 }
 
-template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)*  to, const Packet& from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+                                                const Index offset) {
   const Index packet_size = unpacket_traits<Packet>::size;
   eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
   const Index size = sizeof(__UNPACK_TYPE__(Packet));
@@ -1469,181 +1790,237 @@
   if (n) {
     EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
     pstore(store, from);
-    unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
-    unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
     Index n2 = n * size;
     if (16 <= n2) {
       pstoreu(to2, ploadu<Packet16uc>(store2));
     } else {
-      memcpy((void *)to2, (void *)store2, n2);
+      memcpy((void*)to2, (void*)store2, n2);
     }
   }
 #endif
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float*  to, const Packet4f& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
   pstoreu_partial_common<Packet4f>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int*  to, const Packet4i& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
   pstoreu_partial_common<Packet4i>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int*  to, const Packet8s& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
+                                                    const Index offset) {
   pstoreu_partial_common<Packet8s>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int*  to, const Packet8us& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+                                                             const Index n, const Index offset) {
   pstoreu_partial_common<Packet8us>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16*      to, const Packet8bf& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+                                                   const Index offset) {
   pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char*  to, const Packet16c& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+                                                      const Index offset) {
   pstoreu_partial_common<Packet16c>(to, from, n, offset);
 }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char*  to, const Packet16uc& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+                                                        const Index offset) {
   pstoreu_partial_common<Packet16uc>(to, from, n, offset);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int x;
+  vec_ste(a, 0, &x);
+  return x;
+}
 
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
   EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
   vec_ste(a, 0, &x);
   return x;
 }
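+// Editor's note: with a 16-byte-aligned destination, vec_ste stores exactly
+// element 0, so pfirst round-trips lane 0 through an aligned stack slot —
+// portable down to baseline AltiVec, at the cost of a store and reload.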
 
-template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+template <>
+EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
   return pfirst_common<Packet8s>(a);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+template <>
+EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
   return pfirst_common<Packet8us>(a);
 }
 
-template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
   return pfirst_common<Packet16c>(a);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
   return pfirst_common<Packet16uc>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
-  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return reinterpret_cast<Packet4f>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
 }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
-  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return reinterpret_cast<Packet4i>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
 }
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
-  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+  return reinterpret_cast<Packet8s>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
 }
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
-  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+  return reinterpret_cast<Packet8us>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
 }
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
 #ifdef _ARCH_PWR9
   return vec_revb(a);
 #else
   return vec_perm(a, a, p16uc_REVERSE8);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
 #ifdef _ARCH_PWR9
   return vec_revb(a);
 #else
   return vec_perm(a, a, p16uc_REVERSE8);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
   return preverse<Packet8us>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8bf  pabs(const Packet8bf& a) {
-  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
   return pand<Packet8us>(p8us_abs_mask, a);
 }
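+// Editor's note: pabs on the unsigned packets is the identity, and the bf16
+// version clears the sign bit with a 0x7FFF mask directly on the 16-bit
+// payload — no conversion to float is needed.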
 
-template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return vec_sra(a.m_val, vec_splat_u16(15)); }
-template<> EIGEN_STRONG_INLINE Packet4f  psignbit(const Packet4f&  a) { return  (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31))); }
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
+  return vec_sra(a.m_val, vec_splat_u16(15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
+}
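+// Editor's note: an arithmetic shift right by (lane width - 1) bits smears
+// the sign bit across the lane, so psignbit yields all-ones for negative
+// values and all-zeros otherwise — shift 15 for bf16 lanes, 31 for floats.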
 
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
-{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
-{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
-{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
   return reinterpret_cast<Packet4f>(r);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
   return reinterpret_cast<Packet4f>(r);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   return vec_sr(a, p4ui_mask);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
   return vec_sl(a, p4ui_mask);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
   return vec_sl(a, p8us_mask);
 }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
   const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
   return vec_sr(a, p8us_mask);
 }
 
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
   return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
 }
 
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
-  return pand<Packet4f>(
-    reinterpret_cast<Packet4f>(bf.m_val),
-    reinterpret_cast<Packet4f>(p4ui_high_mask)
-  );
+  return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
 }
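+// Editor's note: a bfloat16 is the high 16 bits of a float, and each 32-bit
+// lane of a Packet8bf holds two bf16 values — one set of alternating lanes is
+// promoted by shifting it into the high half, the other already sits there
+// and only needs its low 16 bits masked. Packet8bf ops below split lanes this
+// way, compute in float, and repack.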
 
 EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
@@ -1660,20 +2037,20 @@
   return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
 }
 
-//#define SUPPORT_BF16_SUBNORMALS
+// #define SUPPORT_BF16_SUBNORMALS
 
 #ifndef __VEC_CLASS_FP_NAN
-#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NAN (1 << 6)
 #endif
 
 #if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
-#define __VEC_CLASS_FP_SUBNORMAL_P (1<<1)
-#define __VEC_CLASS_FP_SUBNORMAL_N (1<<0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
 
 #define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
 #endif
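+// Editor's note: on pre-Power10 targets, F32ToBf16 below rounds to nearest
+// even by hand: lsb is the bit that will become the bf16 LSB, and adding
+// (0x7FFF + lsb) to the float bits carries into bit 16 exactly when the
+// discarded half is above 0.5 ULP, or equal to it with an odd result.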
 
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
 #ifdef _ARCH_PWR10
   return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
 #else
@@ -1681,7 +2058,7 @@
   Packet4ui lsb = plogical_shift_right<16>(input);
   lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
 
-  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
+  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
   Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
   input = padd<Packet4ui>(input, rounding_bias);
 
@@ -1696,7 +2073,7 @@
 #endif
 #else
 #ifdef SUPPORT_BF16_SUBNORMALS
-  //Test NaN and Subnormal
+  // Test NaN and Subnormal
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
   Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
 
@@ -1706,22 +2083,18 @@
   Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
   Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
 
-  Packet4ui nan_selector = pandnot<Packet4ui>(
-      reinterpret_cast<Packet4ui>(is_max_exp),
-      reinterpret_cast<Packet4ui>(is_mant_zero)
-  );
+  Packet4ui nan_selector =
+      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
 
   Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
 
-  Packet4ui subnormal_selector = pandnot<Packet4ui>(
-      reinterpret_cast<Packet4ui>(is_zero_exp),
-      reinterpret_cast<Packet4ui>(is_mant_zero)
-  );
+  Packet4ui subnormal_selector =
+      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
 
   input = vec_sel(input, p4ui_nan, nan_selector);
   input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
 #else
-  //Test only NaN
+  // Test only NaN
   Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
 
   input = vec_sel(p4ui_nan, input, nan_selector);
@@ -1739,9 +2112,8 @@
  *
  * @param lohi to expect either a low & high OR odd & even order
  */
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
   if (lohi) {
     return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
   } else {
@@ -1754,9 +2126,8 @@
  *
  * @param lohi to expect either a low & high OR odd & even order
  */
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
   if (lohi) {
     return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
   } else {
@@ -1764,9 +2135,8 @@
   }
 }
 #else
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
   if (lohi) {
     return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
   } else {
@@ -1774,9 +2144,8 @@
   }
 }
 
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
   if (lohi) {
     return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
   } else {
@@ -1790,14 +2159,13 @@
  *
  * @param lohi to expect either a low & high OR odd & even order
  */
-template<bool lohi = true>
-EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi)
-{
+template <bool lohi = true>
+EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
   Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
   Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
 
   Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
-  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS,0x7FFFu);
+  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
   lsb = padd<Packet8us>(lsb, p8us_BIAS);
   lsb = padd<Packet8us>(lsb, p4f2);
 
@@ -1807,20 +2175,22 @@
 #ifdef _ARCH_PWR9
   Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
   Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
-  Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+  Packet8us nan_selector =
+      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
 
   input = vec_sel(input, p8us_BIAS, nan_selector);
 
 #ifdef SUPPORT_BF16_SUBNORMALS
   Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
   Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
-  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo), reinterpret_cast<Packet4f>(subnormal_selector_hi));
+  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
+                                                   reinterpret_cast<Packet4f>(subnormal_selector_hi));
 
   input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
 #endif
 #else
 #ifdef SUPPORT_BF16_SUBNORMALS
-  //Test NaN and Subnormal
+  // Test NaN and Subnormal
   const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
   Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
 
@@ -1830,26 +2200,23 @@
   Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
   Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
 
-  Packet8us nan_selector = pandnot<Packet8us>(
-      reinterpret_cast<Packet8us>(is_max_exp),
-      reinterpret_cast<Packet8us>(is_mant_zero)
-  );
+  Packet8us nan_selector =
+      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
 
   Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
 
-  Packet8us subnormal_selector = pandnot<Packet8us>(
-      reinterpret_cast<Packet8us>(is_zero_exp),
-      reinterpret_cast<Packet8us>(is_mant_zero)
-  );
+  Packet8us subnormal_selector =
+      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
 
   // Using BIAS as NaN (since any or all of the last 7 bits can be set)
   input = vec_sel(input, p8us_BIAS, nan_selector);
   input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
 #else
-  //Test only NaN
+  // Test only NaN
   Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
   Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
-  Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+  Packet8us nan_selector =
+      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
 
   input = vec_sel(p8us_BIAS, input, nan_selector);
 #endif
@@ -1861,8 +2228,7 @@
 /**
  * Convert and pack two float Packets into one bfloat16 Packet - low & high order
  */
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi)
-{
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
 #ifdef _ARCH_PWR10
   Packet8bf fp16_0 = F32ToBf16(lo);
   Packet8bf fp16_1 = F32ToBf16(hi);
@@ -1875,7 +2241,7 @@
 /**
  * Convert and pack two float Packets into one bfloat16 Packet - odd & even order
  */
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
 #ifdef _ARCH_PWR10
   return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
 #else
@@ -1883,66 +2249,76 @@
 #endif
 }
 #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
-  Packet4f a_even = Bf16ToF32Even(A);\
-  Packet4f a_odd = Bf16ToF32Odd(A);\
-  Packet4f op_even = OP(a_even);\
-  Packet4f op_odd = OP(a_odd);\
-  return F32ToBf16(op_even, op_odd);\
+  Packet4f a_even = Bf16ToF32Even(A);       \
+  Packet4f a_odd = Bf16ToF32Odd(A);         \
+  Packet4f op_even = OP(a_even);            \
+  Packet4f op_odd = OP(a_odd);              \
+  return F32ToBf16(op_even, op_odd);
 
 #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
-  Packet4f a_even = Bf16ToF32Even(A);\
-  Packet4f a_odd = Bf16ToF32Odd(A);\
-  Packet4f b_even = Bf16ToF32Even(B);\
-  Packet4f b_odd = Bf16ToF32Odd(B);\
-  Packet4f op_even = OP(a_even, b_even);\
-  Packet4f op_odd = OP(a_odd, b_odd);\
-  return F32ToBf16(op_even, op_odd);\
+  Packet4f a_even = Bf16ToF32Even(A);           \
+  Packet4f a_odd = Bf16ToF32Odd(A);             \
+  Packet4f b_even = Bf16ToF32Even(B);           \
+  Packet4f b_odd = Bf16ToF32Odd(B);             \
+  Packet4f op_even = OP(a_even, b_even);        \
+  Packet4f op_odd = OP(a_odd, b_odd);           \
+  return F32ToBf16(op_even, op_odd);
 
 #define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
-  Packet4f a_even = Bf16ToF32Even(A);\
-  Packet4f a_odd = Bf16ToF32Odd(A);\
-  Packet4f b_even = Bf16ToF32Even(B);\
-  Packet4f b_odd = Bf16ToF32Odd(B);\
-  Packet4f op_even = OP(a_even, b_even);\
-  Packet4f op_odd = OP(a_odd, b_odd);\
-  return F32ToBf16Bool(op_even, op_odd);\
+  Packet4f a_even = Bf16ToF32Even(A);                \
+  Packet4f a_odd = Bf16ToF32Odd(A);                  \
+  Packet4f b_even = Bf16ToF32Even(B);                \
+  Packet4f b_odd = Bf16ToF32Odd(B);                  \
+  Packet4f op_even = OP(a_even, b_even);             \
+  Packet4f op_odd = OP(a_odd, b_odd);                \
+  return F32ToBf16Bool(op_even, op_odd);
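+// Editor's note: these wrappers realize the split/compute/repack scheme for
+// bf16 — every wrapped operation costs two Packet4f operations plus the
+// even/odd conversions on each side.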
 
-template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
-  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask,0x8000);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
   return pxor<Packet8us>(p8us_neg_mask, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
   Packet4f a_even = Bf16ToF32Even(a);
   Packet4f a_odd = Bf16ToF32Odd(a);
   Packet4f e_even;
@@ -1953,30 +2329,38 @@
   return F32ToBf16(op_even, op_odd);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
 }
 #ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
   BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
 }
 #endif
-template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
   Packet4f a_even = Bf16ToF32Even(a);
   Packet4f a_odd = Bf16ToF32Odd(a);
   Packet4f b_even = Bf16ToF32Even(b);
@@ -1988,54 +2372,62 @@
   return F32ToBf16(pmadd_even, pmadd_odd);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
   BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
   return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const  bfloat16*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
   return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
-  bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
-                            bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+                           bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
   return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
 }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
   Packet4f b, sum;
-  b   = vec_sld(a, a, 8);
+  b = vec_sld(a, a, 8);
   sum = a + b;
-  b   = vec_sld(sum, sum, 4);
+  b = vec_sld(sum, sum, 4);
   sum += b;
   return pfirst(sum);
 }
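 
 // predux above is a two-step tree reduction: vec_sld(a, a, 8) rotates the
 // vector by 8 bytes so lanes {2, 3} line up under {0, 1}, and a final 4-byte
 // rotation folds the remaining pair. Scalar equivalent (hypothetical helper,
 // illustration only):
 static inline float predux4_sketch(const float a[4]) {
   float s0 = a[0] + a[2];  // lanes combined by the 8-byte rotate-and-add
   float s1 = a[1] + a[3];
   return s0 + s1;  // lane 0 after the 4-byte rotate-and-add
 }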
 
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
   Packet4i sum;
   sum = vec_sums(a, p4i_ZERO);
 #ifdef _BIG_ENDIAN
@@ -2046,89 +2438,89 @@
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
   float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
-  float redux_odd  = predux<Packet4f>(Bf16ToF32Odd(a));
+  float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
   float f32_result = redux_even + redux_odd;
   return bfloat16(f32_result);
 }
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
-{
-  union{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
+  union {
     Packet v;
     __UNPACK_TYPE__(Packet) n[8];
   } vt;
   vt.v = a;
 
-  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  Packet4i first_half  = pload<Packet4i>(first_loader);
+  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+  Packet4i first_half = pload<Packet4i>(first_loader);
   Packet4i second_half = pload<Packet4i>(second_loader);
 
   return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
 }
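 
 // predux_size8 widens through the union: the eight 16-bit lanes are spilled
 // into two aligned int[4] buffers, reloaded as Packet4i, and reduced with the
 // 32-bit predux, so the accumulation cannot overflow the 16-bit domain.
 // Scalar equivalent (hypothetical helper, illustration only):
 static inline short predux_size8_sketch(const short n[8]) {
   int total = 0;
   for (int i = 0; i < 8; ++i) total += n[i];  // accumulate in 32 bits
   return static_cast<short>(total);           // truncate back, like the cast above
 }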
 
-template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
   return predux_size8<Packet8s>(a);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
   return predux_size8<Packet8us>(a);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
-{
-  union{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
+  union {
     Packet v;
     __UNPACK_TYPE__(Packet) n[16];
   } vt;
   vt.v = a;
 
-  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
-  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
+  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+  EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
+  EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
 
   Packet4i first_quarter = pload<Packet4i>(first_loader);
   Packet4i second_quarter = pload<Packet4i>(second_loader);
   Packet4i third_quarter = pload<Packet4i>(third_loader);
   Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
 
-  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
-		                  + predux(third_quarter) + predux(fourth_quarter));
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
+                                              predux(fourth_quarter));
 }
 
-template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
   return predux_size16<Packet16c>(a);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
   return predux_size16<Packet16uc>(a);
 }
 
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
   Packet4f prod;
   prod = pmul(a, vec_sld(a, a, 8));
   return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
   return aux[0] * aux[1] * aux[2] * aux[3];
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
   Packet8s pair, quad, octo;
 
   pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2138,8 +2530,8 @@
   return pfirst(octo);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
   Packet8us pair, quad, octo;
 
   pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2149,17 +2541,16 @@
   return pfirst(octo);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
   float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
-  float redux_odd  = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+  float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
   float f32_result = redux_even * redux_odd;
   return bfloat16(f32_result);
 }
 
-
-template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
   Packet16c pair, quad, octo, result;
 
   pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2170,8 +2561,8 @@
   return pfirst(result);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
   Packet16uc pair, quad, octo, result;
 
   pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2183,66 +2574,64 @@
 }
 
 // min
-template<typename Packet> EIGEN_STRONG_INLINE
-__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
   Packet b, res;
   b = vec_min(a, vec_sld(a, a, 8));
   res = vec_min(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
-
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
   return predux_min4<Packet4f>(a);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
   return predux_min4<Packet4i>(a);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
   float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
-  float redux_odd  = predux_min<Packet4f>(Bf16ToF32Odd(a));
+  float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
   float f32_result = (std::min)(redux_even, redux_odd);
   return bfloat16(f32_result);
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
   Packet8s pair, quad, octo;
-  
-  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
-  pair = vec_min(a, vec_sld(a, a, 8)); 
 
-  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8));
+
+  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
   quad = vec_min(pair, vec_sld(pair, pair, 4));
 
-  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
   octo = vec_min(quad, vec_sld(quad, quad, 2));
   return pfirst(octo);
 }
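 
 // The min/max reductions use the same rotate-and-combine ladder as predux,
 // with one extra 2-byte step because there are eight 16-bit lanes: each
 // vec_min round halves the number of candidate lanes. Scalar equivalent
 // (hypothetical helper, illustration only):
 static inline short predux_min8_sketch(const short a[8]) {
   short m = a[0];
   for (int i = 1; i < 8; ++i)
     if (a[i] < m) m = a[i];  // the vector version does this in log2(8) = 3 steps
   return m;
 }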
 
-template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
   Packet8us pair, quad, octo;
-  
-  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
-  pair = vec_min(a, vec_sld(a, a, 8)); 
 
-  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8));
+
+  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
   quad = vec_min(pair, vec_sld(pair, pair, 4));
 
-  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
   octo = vec_min(quad, vec_sld(quad, quad, 2));
   return pfirst(octo);
 }
 
-template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
   Packet16c pair, quad, octo, result;
 
   pair = vec_min(a, vec_sld(a, a, 8));
@@ -2253,8 +2642,8 @@
   return pfirst(result);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
   Packet16uc pair, quad, octo, result;
 
   pair = vec_min(a, vec_sld(a, a, 8));
@@ -2265,64 +2654,64 @@
   return pfirst(result);
 }
 // max
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
   Packet b, res;
   b = vec_max(a, vec_sld(a, a, 8));
   res = vec_max(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
   return predux_max4<Packet4f>(a);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
   return predux_max4<Packet4i>(a);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
   float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
-  float redux_odd  = predux_max<Packet4f>(Bf16ToF32Odd(a));
+  float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
   float f32_result = (std::max)(redux_even, redux_odd);
   return bfloat16(f32_result);
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
   Packet8s pair, quad, octo;
-  
-  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
-  pair = vec_max(a, vec_sld(a, a, 8)); 
 
-  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8));
+
+  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
   quad = vec_max(pair, vec_sld(pair, pair, 4));
 
-  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
   octo = vec_max(quad, vec_sld(quad, quad, 2));
   return pfirst(octo);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
   Packet8us pair, quad, octo;
-  
-  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
-  pair = vec_max(a, vec_sld(a, a, 8)); 
 
-  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8));
+
+  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
   quad = vec_max(pair, vec_sld(pair, pair, 4));
 
-  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
   octo = vec_max(quad, vec_sld(quad, quad, 2));
   return pfirst(octo);
 }
 
-template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
   Packet16c pair, quad, octo, result;
 
   pair = vec_max(a, vec_sld(a, a, 8));
@@ -2333,8 +2722,8 @@
   return pfirst(result);
 }
 
-template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
   Packet16uc pair, quad, octo, result;
 
   pair = vec_max(a, vec_sld(a, a, 8));
@@ -2345,13 +2734,13 @@
   return pfirst(result);
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
   return vec_any_ne(x, pzero(x));
 }
 
-template <typename T> EIGEN_DEVICE_FUNC inline void
-ptranpose_common(PacketBlock<T,4>& kernel){
+template <typename T>
+EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
   T t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2363,18 +2752,11 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
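 
 // The 4x4 transpose is two rounds of interleaving: vec_mergeh/vec_mergel take
 // the high/low halves of two vectors and zip them, so with rows r0..r3,
 //   t0 = mergeh(r0, r2) = {r0[0], r2[0], r0[1], r2[1]}
 //   t2 = mergeh(r1, r3) = {r1[0], r3[0], r1[1], r3[1]}
 // and mergeh(t0, t2) = {r0[0], r1[0], r2[0], r3[0]}, the first column.
 // The 8x8 and 16x16 variants further down repeat the pattern with more rounds.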
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  ptranpose_common<Packet4f>(kernel);
-}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  ptranpose_common<Packet4i>(kernel);
-}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
   Packet8s t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2386,8 +2768,7 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
   Packet8us t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2399,9 +2780,7 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
   Packet8us t0, t1, t2, t3;
 
   t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
@@ -2414,8 +2793,7 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
   Packet16c t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2427,9 +2805,7 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
   Packet16uc t0, t1, t2, t3;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2441,8 +2817,7 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
   Packet8s v[8], sum[8];
 
   v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -2472,8 +2847,7 @@
   kernel.packet[7] = vec_mergel(sum[3], sum[7]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
   Packet8us v[8], sum[8];
 
   v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -2503,8 +2877,7 @@
   kernel.packet[7] = vec_mergel(sum[3], sum[7]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
   Packet8bf v[8], sum[8];
 
   v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
@@ -2534,8 +2907,7 @@
   kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
   Packet16c step1[16], step2[16], step3[16];
 
   step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -2555,16 +2927,16 @@
   step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
   step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
 
-  step2[0]  = vec_mergeh(step1[0], step1[8]);
-  step2[1]  = vec_mergel(step1[0], step1[8]);
-  step2[2]  = vec_mergeh(step1[1], step1[9]);
-  step2[3]  = vec_mergel(step1[1], step1[9]);
-  step2[4]  = vec_mergeh(step1[2], step1[10]);
-  step2[5]  = vec_mergel(step1[2], step1[10]);
-  step2[6]  = vec_mergeh(step1[3], step1[11]);
-  step2[7]  = vec_mergel(step1[3], step1[11]);
-  step2[8]  = vec_mergeh(step1[4], step1[12]);
-  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[0] = vec_mergeh(step1[0], step1[8]);
+  step2[1] = vec_mergel(step1[0], step1[8]);
+  step2[2] = vec_mergeh(step1[1], step1[9]);
+  step2[3] = vec_mergel(step1[1], step1[9]);
+  step2[4] = vec_mergeh(step1[2], step1[10]);
+  step2[5] = vec_mergel(step1[2], step1[10]);
+  step2[6] = vec_mergeh(step1[3], step1[11]);
+  step2[7] = vec_mergel(step1[3], step1[11]);
+  step2[8] = vec_mergeh(step1[4], step1[12]);
+  step2[9] = vec_mergel(step1[4], step1[12]);
   step2[10] = vec_mergeh(step1[5], step1[13]);
   step2[11] = vec_mergel(step1[5], step1[13]);
   step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -2572,16 +2944,16 @@
   step2[14] = vec_mergeh(step1[7], step1[15]);
   step2[15] = vec_mergel(step1[7], step1[15]);
 
-  step3[0]  = vec_mergeh(step2[0], step2[8]);
-  step3[1]  = vec_mergel(step2[0], step2[8]);
-  step3[2]  = vec_mergeh(step2[1], step2[9]);
-  step3[3]  = vec_mergel(step2[1], step2[9]);
-  step3[4]  = vec_mergeh(step2[2], step2[10]);
-  step3[5]  = vec_mergel(step2[2], step2[10]);
-  step3[6]  = vec_mergeh(step2[3], step2[11]);
-  step3[7]  = vec_mergel(step2[3], step2[11]);
-  step3[8]  = vec_mergeh(step2[4], step2[12]);
-  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[0] = vec_mergeh(step2[0], step2[8]);
+  step3[1] = vec_mergel(step2[0], step2[8]);
+  step3[2] = vec_mergeh(step2[1], step2[9]);
+  step3[3] = vec_mergel(step2[1], step2[9]);
+  step3[4] = vec_mergeh(step2[2], step2[10]);
+  step3[5] = vec_mergel(step2[2], step2[10]);
+  step3[6] = vec_mergeh(step2[3], step2[11]);
+  step3[7] = vec_mergel(step2[3], step2[11]);
+  step3[8] = vec_mergeh(step2[4], step2[12]);
+  step3[9] = vec_mergel(step2[4], step2[12]);
   step3[10] = vec_mergeh(step2[5], step2[13]);
   step3[11] = vec_mergel(step2[5], step2[13]);
   step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2589,16 +2961,16 @@
   step3[14] = vec_mergeh(step2[7], step2[15]);
   step3[15] = vec_mergel(step2[7], step2[15]);
 
-  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
-  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
-  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
-  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
-  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
-  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
-  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
-  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
-  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
-  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
   kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
   kernel.packet[11] = vec_mergel(step3[5], step3[13]);
   kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2607,8 +2979,7 @@
   kernel.packet[15] = vec_mergel(step3[7], step3[15]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
   Packet16uc step1[16], step2[16], step3[16];
 
   step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -2628,16 +2999,16 @@
   step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
   step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
 
-  step2[0]  = vec_mergeh(step1[0], step1[8]);
-  step2[1]  = vec_mergel(step1[0], step1[8]);
-  step2[2]  = vec_mergeh(step1[1], step1[9]);
-  step2[3]  = vec_mergel(step1[1], step1[9]);
-  step2[4]  = vec_mergeh(step1[2], step1[10]);
-  step2[5]  = vec_mergel(step1[2], step1[10]);
-  step2[6]  = vec_mergeh(step1[3], step1[11]);
-  step2[7]  = vec_mergel(step1[3], step1[11]);
-  step2[8]  = vec_mergeh(step1[4], step1[12]);
-  step2[9]  = vec_mergel(step1[4], step1[12]);
+  step2[0] = vec_mergeh(step1[0], step1[8]);
+  step2[1] = vec_mergel(step1[0], step1[8]);
+  step2[2] = vec_mergeh(step1[1], step1[9]);
+  step2[3] = vec_mergel(step1[1], step1[9]);
+  step2[4] = vec_mergeh(step1[2], step1[10]);
+  step2[5] = vec_mergel(step1[2], step1[10]);
+  step2[6] = vec_mergeh(step1[3], step1[11]);
+  step2[7] = vec_mergel(step1[3], step1[11]);
+  step2[8] = vec_mergeh(step1[4], step1[12]);
+  step2[9] = vec_mergel(step1[4], step1[12]);
   step2[10] = vec_mergeh(step1[5], step1[13]);
   step2[11] = vec_mergel(step1[5], step1[13]);
   step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -2645,16 +3016,16 @@
   step2[14] = vec_mergeh(step1[7], step1[15]);
   step2[15] = vec_mergel(step1[7], step1[15]);
 
-  step3[0]  = vec_mergeh(step2[0], step2[8]);
-  step3[1]  = vec_mergel(step2[0], step2[8]);
-  step3[2]  = vec_mergeh(step2[1], step2[9]);
-  step3[3]  = vec_mergel(step2[1], step2[9]);
-  step3[4]  = vec_mergeh(step2[2], step2[10]);
-  step3[5]  = vec_mergel(step2[2], step2[10]);
-  step3[6]  = vec_mergeh(step2[3], step2[11]);
-  step3[7]  = vec_mergel(step2[3], step2[11]);
-  step3[8]  = vec_mergeh(step2[4], step2[12]);
-  step3[9]  = vec_mergel(step2[4], step2[12]);
+  step3[0] = vec_mergeh(step2[0], step2[8]);
+  step3[1] = vec_mergel(step2[0], step2[8]);
+  step3[2] = vec_mergeh(step2[1], step2[9]);
+  step3[3] = vec_mergel(step2[1], step2[9]);
+  step3[4] = vec_mergeh(step2[2], step2[10]);
+  step3[5] = vec_mergel(step2[2], step2[10]);
+  step3[6] = vec_mergeh(step2[3], step2[11]);
+  step3[7] = vec_mergel(step2[3], step2[11]);
+  step3[8] = vec_mergeh(step2[4], step2[12]);
+  step3[9] = vec_mergel(step2[4], step2[12]);
   step3[10] = vec_mergeh(step2[5], step2[13]);
   step3[11] = vec_mergel(step2[5], step2[13]);
   step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2662,16 +3033,16 @@
   step3[14] = vec_mergeh(step2[7], step2[15]);
   step3[15] = vec_mergel(step2[7], step2[15]);
 
-  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
-  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
-  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
-  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
-  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
-  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
-  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
-  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
-  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
-  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
+  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
   kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
   kernel.packet[11] = vec_mergel(step3[5], step3[13]);
   kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2680,112 +3051,127 @@
   kernel.packet[15] = vec_mergel(step3[7], step3[15]);
 }
 
-template<typename Packet> EIGEN_STRONG_INLINE
-Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
   Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
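 
 // pblend4 turns the 0/1 entries of the Selector into full-width lane masks by
 // integer negation: -(1) is all ones, -(0) is all zeros, which is exactly the
 // mask shape vec_sel expects. Per-lane scalar equivalent (hypothetical helper,
 // illustration only):
 static inline int pblend_lane_sketch(unsigned sel, int thenVal, int elseVal) {
   unsigned mask = static_cast<unsigned>(-static_cast<int>(sel));  // 0 -> 0x0, 1 -> 0xFFFFFFFF
   return static_cast<int>((static_cast<unsigned>(thenVal) & mask) |
                           (static_cast<unsigned>(elseVal) & ~mask));
 }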
 
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
   return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
   return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
-  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+template <>
+EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
+                                    const Packet8s& elsePacket) {
+  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
   Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
   Packet8s result = vec_sel(elsePacket, thenPacket, mask);
   return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
-  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+template <>
+EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
+                                     const Packet8us& elsePacket) {
+  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
   Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
+                                     const Packet8bf& elsePacket) {
   return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
-  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
-                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+template <>
+EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
+                                     const Packet16c& elsePacket) {
+  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
+                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
+                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
 
   Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
-  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
-                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+template <>
+EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
+                                      const Packet16uc& elsePacket) {
+  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
+                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
+                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
 
   Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-
 //---------- double ----------
 #ifdef EIGEN_VECTORIZE_VSX
-typedef __vector double              Packet2d;
-typedef __vector unsigned long long  Packet2ul;
-typedef __vector long long           Packet2l;
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
 #if EIGEN_COMP_CLANG
-typedef Packet2ul                    Packet2bl;
+typedef Packet2ul Packet2bl;
 #else
-typedef __vector __bool long         Packet2bl;
+typedef __vector __bool long Packet2bl;
 #endif
 
-static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
-static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
-static Packet2d  p2d_ONE  = { 1.0, 1.0 };
-static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
-static Packet2d  p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
-                               numext::bit_cast<double>(0x8000000000000000ull) };
+static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
+static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
+                             numext::bit_cast<double>(0x8000000000000000ull)};
 
 #ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+static Packet2d p2d_COUNTDOWN =
+    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
 #else
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+static Packet2d p2d_COUNTDOWN =
+    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
 #endif
 
-template<int index> Packet2d vec_splat_dbl(Packet2d& a)
-{
+template <int index>
+Packet2d vec_splat_dbl(Packet2d& a) {
   return vec_splat(a, index);
 }
 
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=2,
+    size = 2,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 1,
-    HasMin  = 1,
-    HasMax  = 1,
-    HasAbs  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
     HasATan = 0,
-    HasLog  = 0,
-    HasExp  = 1,
+    HasLog = 0,
+    HasExp = 1,
     HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
     HasRsqrt = 1,
@@ -2801,12 +3187,22 @@
   };
 };
 
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2d half;
+};
 
-inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
   union {
-    Packet2l   v;
+    Packet2l v;
     int64_t n[2];
   } vt;
   vt.v = v;
@@ -2814,10 +3210,9 @@
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
   union {
-    Packet2d   v;
+    Packet2d v;
     double n[2];
   } vt;
   vt.v = v;
@@ -2826,74 +3221,86 @@
 }
 
 // Need to define them first or we get "specialization after instantiation" errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
   EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
+  return vec_xl(0, const_cast<double*>(from));  // cast needed by Clang
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
   return pload_partial_common<Packet2d>(from, n, offset);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
   EIGEN_DEBUG_ALIGNED_STORE
   vec_xst(from, 0, to);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double*  to, const Packet2d& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
   pstore_partial_common<Packet2d>(to, from, n, offset);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   Packet2d v = {from, from};
   return v;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
   Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
   return reinterpret_cast<Packet2d>(v);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-  //This way is faster than vec_splat (at least for doubles in Power 9)
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
+  // This way is faster than vec_splat (at least for doubles on POWER9)
   a0 = pset1<Packet2d>(a[0]);
   a1 = pset1<Packet2d>(a[1]);
   a2 = pset1<Packet2d>(a[2]);
   a3 = pset1<Packet2d>(a[3]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
   return pgather_common<Packet2d>(from, stride);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
+                                                                                 const Index n) {
   return pgather_common<Packet2d>(from, stride, n);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
   pscatter_common<Packet2d>(to, from, stride);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
+                                                                              Index stride, const Index n) {
   pscatter_common<Packet2d>(to, from, stride, n);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return pset1<Packet2d>(a) + p2d_COUNTDOWN;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return a + b;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return a - b;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
 #ifdef __POWER8_VECTOR__
   return vec_neg(a);
 #else
@@ -2901,150 +3308,214 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_madd(a, b, p2d_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_div(a, b);
+}
 
 // For some weird reason, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_msub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmsub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmadd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_nmadd(a, b, c);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
   // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet2d ret;
-  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
   return ret;
- }
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
   // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet2d ret;
-  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
   return ret;
 }
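 
 // Both asm blocks are a compare-and-select: xvcmpgedp/xvcmpgtdp build a lane
 // mask and xxsel picks per lane, so the scalar semantics are
 //   pmin: (a >= b) ? b : a   and   pmax: (b > a) ? b : a.
 // When either operand is NaN the comparison is false and a is returned,
 // which is what makes them consistent with std::min/std::max and SSE.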
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
-  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
-  return vec_nor(c,c);
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmple(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
+  return vec_nor(c, c);
 }
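 
 // pcmp_lt_or_nan leans on IEEE comparisons being false for NaN: vec_cmpge(a, b)
 // is true exactly when a >= b, so its bitwise complement vec_nor(c, c) is true
 // when a < b or when either operand is NaN, i.e. the "less than, or unordered"
 // predicate that callers expect.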
 
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
-{
-    Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
-    Packet2d res;
-
-    __asm__("xvrdpiz %x0, %x1\n\t"
-        : "=&wa" (res)
-        : "wa" (t));
-
-    return res;
-}
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
-{
-    Packet2d res;
-
-    __asm__("xvrdpic %x0, %x1\n\t"
-        : "=&wa" (res)
-        : "wa" (a));
-
-    return res;
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_or(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_xor(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, vec_nor(b, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  Packet2d t = vec_add(
+      reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+  Packet2d res;
+
+  __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
+
+  return res;
+}
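 
 // pround implements round-half-away-from-zero as truncate(a + copysign(c, a)),
 // where c = 0x3FDFFFFFFFFFFFFF is the largest double below 0.5. Using c rather
 // than 0.5 avoids a double-rounding bug: for the input just below 0.5, adding
 // exactly 0.5 would round up to 1.0 before truncation. Scalar sketch
 // (hypothetical helper, illustration only):
 static inline double pround_sketch(double a) {
   const double c = 0.49999999999999994;  // nextafter(0.5, 0.0), i.e. 0x3FDFFFFFFFFFFFFF
   double t = a + ((a < 0.0) ? -c : c);   // the vector code copies the sign bit via p2ul_SIGN
   return static_cast<double>(static_cast<long long>(t));  // xvrdpiz: truncate toward zero
 }
 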
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  Packet2d res;
+
+  __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
+
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return vec_xl(0, const_cast<double*>(from));
 }
 
-template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
   return ploadu_partial_common<Packet2d>(from, n, offset);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
   Packet2d p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
-  else                                  p = ploadu<Packet2d>(from);
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet2d>(from);
+  else
+    p = ploadu<Packet2d>(from);
   return vec_splat_dbl<0>(p);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
   EIGEN_DEBUG_UNALIGNED_STORE
   vec_xst(from, 0, to);
 }
 
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double*  to, const Packet2d& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
   pstoreu_partial_common<Packet2d>(to, from, n, offset);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_ALIGN16 double x[2];
+  pstore<double>(x, a);
+  return x[0];
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
   return vec_sld(a, a, 8);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return vec_abs(a);
+}
 #ifdef __POWER8_VECTOR__
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d&  a) { return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
+}
 #else
 #ifdef _BIG_ENDIAN
-static Packet16uc p16uc_DUPSIGN = { 0,0,0,0, 0,0,0,0, 8,8,8,8, 8,8,8,8 };
+static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
 #else
-static Packet16uc p16uc_DUPSIGN = { 7,7,7,7, 7,7,7,7, 15,15,15,15, 15,15,15,15 };
+static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d&  a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
   Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
   return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
 }
 #endif
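 
 // Without a 64-bit arithmetic shift (pre-POWER8), psignbit first shifts every
 // byte right arithmetically by 7, turning each byte into 0x00 or 0xFF from its
 // own top bit, then vec_perm with p16uc_DUPSIGN broadcasts the single byte
 // holding each double's sign bit across its whole 8-byte lane: all ones for
 // negative lanes, all zeros otherwise.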
 
-template<> inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
+template <>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
 
-template<> inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
+template <>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
 
 // Packet2l shifts.
-// For POWER8 we simply use vec_sr/l. 
+// For POWER8 we simply use vec_sl/vec_sr.
 //
 // Things are more complicated for POWER7. There is actually a
 // vec_xxsxdi intrinsic but it is not supported by some gcc versions.
 // So we need to shift by N % 32 and rearrange bytes.
 #ifdef __POWER8_VECTOR__
 
-template<int N>
+template <int N>
 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
-  const Packet2ul shift = { N, N };
-  return vec_sl(a, shift); 
+  const Packet2ul shift = {N, N};
+  return vec_sl(a, shift);
 }
 
-template<int N>
+template <int N>
 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
-  const Packet2ul shift = { N, N };
-  return vec_sr(a, shift); 
+  const Packet2ul shift = {N, N};
+  return vec_sr(a, shift);
 }
 
 #else
@@ -3052,34 +3523,32 @@
 // Shifts [A, B, C, D] to [B, 0, D, 0].
 // Used to implement left shifts for Packet2l.
 EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
-  static const Packet16uc perm = {
-      0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
-      0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-  #ifdef  _BIG_ENDIAN
-    return vec_perm(p4i_ZERO, a, perm);
-  #else
-    return vec_perm(a, p4i_ZERO, perm);
-  #endif
+  static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                                  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
+#ifdef _BIG_ENDIAN
+  return vec_perm(p4i_ZERO, a, perm);
+#else
+  return vec_perm(a, p4i_ZERO, perm);
+#endif
 }
 
 // Shifts [A, B, C, D] to [0, A, 0, C].
 // Used to implement right shifts for Packet2l.
 EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
-  static const Packet16uc perm = {
-      0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 
-      0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
-  #ifdef  _BIG_ENDIAN
-    return vec_perm(p4i_ZERO, a, perm);
-  #else
-    return vec_perm(a, p4i_ZERO, perm);
-  #endif
+  static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
+#ifdef _BIG_ENDIAN
+  return vec_perm(p4i_ZERO, a, perm);
+#else
+  return vec_perm(a, p4i_ZERO, perm);
+#endif
 }
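 
 // The POWER7 fallback assembles a 64-bit shift from 32-bit ones via the usual
 // double-word identity; for a left shift by N < 32, viewing a 64-bit lane as
 // 32-bit words hi:lo,
 //   (hi:lo) << N == ((hi << N) | (lo >> (32 - N))) : (lo << N).
 // shift_even_left/shift_odd_right supply the cross-word term by permuting the
 // neighbouring word into place while zeroing the word that must not receive a
 // carry; shifts of 32 or more degenerate to a permute plus a shift by N - 32.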
 
-template<int N, typename EnableIf = void>
+template <int N, typename EnableIf = void>
 struct plogical_shift_left_impl;
 
-template<int N>
-struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
   static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
     static const unsigned n = static_cast<unsigned>(N);
     const Packet4ui shift = {n, n, n, n};
@@ -3092,8 +3561,8 @@
   }
 };
 
-template<int N>
-struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)>> {
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
   static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
     static const unsigned m = static_cast<unsigned>(N - 32);
     const Packet4ui shift = {m, m, m, m};
@@ -3102,16 +3571,16 @@
   }
 };
 
-template<int N>
+template <int N>
 EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
-  return plogical_shift_left_impl<N>::run(a); 
+  return plogical_shift_left_impl<N>::run(a);
 }
 
-template<int N, typename EnableIf = void>
+template <int N, typename EnableIf = void>
 struct plogical_shift_right_impl;
 
-template<int N>
-struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
   static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
     static const unsigned n = static_cast<unsigned>(N);
     const Packet4ui shift = {n, n, n, n};
@@ -3124,8 +3593,8 @@
   }
 };
 
-template<int N>
-struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)>> {
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
   static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
     static const unsigned m = static_cast<unsigned>(N - 32);
     const Packet4ui shift = {m, m, m, m};
@@ -3134,69 +3603,71 @@
   }
 };
 
-template<int N>
+template <int N>
 EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
-  return plogical_shift_right_impl<N>::run(a); 
+  return plogical_shift_right_impl<N>::run(a);
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   // Clamp exponent to [-2099, 2099]
   const Packet2d max_exponent = pset1<Packet2d>(2099.0);
   const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
 
   // Split 2^e into four factors and multiply:
-  const Packet2l  bias = { 1023, 1023 };
+  const Packet2l bias = {1023, 1023};
   Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
   Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
-  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
-  b = psub(psub(psub(e, b), b), b);  // e - 3b
-  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
-  out = pmul(out, c); // a * 2^e
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                        // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                   // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
+  out = pmul(out, c);                                                 // a * 2^e
   return out;
 }
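
The four-factor split warrants a scalar illustration: writing e = 3b + (e - 3b)
with b roughly e/4 keeps every partial power of two well inside the normal
double range, so a * 2^e cannot overflow or underflow in an intermediate
product. A minimal sketch of the same idea (hypothetical helper; the
truncating cast stands in for pcast):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  double ldexp_sketch(double a, double exponent) {
    long long e = (long long)std::fmax(std::fmin(exponent, 2099.0), -2099.0);
    long long b = e >> 2;             // floor(e/4)
    auto two_pow = [](long long p) {  // build 2^p via the exponent field
      std::uint64_t bits = (std::uint64_t)(p + 1023) << 52;
      double d;
      std::memcpy(&d, &bits, sizeof(d));
      return d;
    };
    double c = two_pow(b);            // biased exponent stays in [498, 1547]
    double out = ((a * c) * c) * c;   // a * 2^(3b)
    return out * two_pow(e - 3 * b);  // * 2^(e - 3b)
  }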
 
-
 // Extract the exponent without relying on the existence of Packet2l.
-template<>
-EIGEN_STRONG_INLINE  
-Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
   return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
   Packet2d b, sum;
-  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
   sum = a + b;
   return pfirst<Packet2d>(sum);
 }
 
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
 // max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   Packet2d t0, t1;
   t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
   t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
@@ -3204,16 +3675,17 @@
   kernel.packet[1] = t1;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
-  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
   Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
   return vec_sel(elsePacket, thenPacket, mask);
 }
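
The selector trick above: Selector<2> carries 0/1 integers, and negating them
yields the all-zeros or all-ones lane masks that vec_sel expects. In scalar
form (illustrative helper):

  #include <cstdint>

  std::uint64_t blend_lane(std::uint64_t then_bits, std::uint64_t else_bits, long long select01) {
    std::uint64_t mask = (std::uint64_t)(-select01);  // 0 -> 0x00...0, 1 -> 0xFF...F
    return (then_bits & mask) | (else_bits & ~mask);
  }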
 
+#endif  // __VSX__
+}  // end namespace internal
 
-#endif // __VSX__
-} // end namespace internal
+}  // end namespace Eigen
 
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_ALTIVEC_H
+#endif  // EIGEN_PACKET_MATH_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
index 361c69f..fdabeb9 100644
--- a/Eigen/src/Core/arch/AltiVec/TypeCasting.h
+++ b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
@@ -19,57 +19,46 @@
 namespace internal {
 template <>
 struct type_casting_traits<float, int> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 
 template <>
 struct type_casting_traits<int, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 
 template <>
 struct type_casting_traits<bfloat16, unsigned short int> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 
 template <>
 struct type_casting_traits<unsigned short int, bfloat16> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return vec_cts(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vec_cts(a, 0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
-  return vec_ctu(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vec_ctu(a, 0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
-  return vec_ctf(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vec_ctf(a, 0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
-  return vec_ctf(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vec_ctf(a, 0);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
   Packet4f float_even = Bf16ToF32Even(a);
   Packet4f float_odd = Bf16ToF32Odd(a);
   Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
@@ -78,13 +67,13 @@
   Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
   Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
 
-  //Check values that are bigger than USHRT_MAX (0xFFFF)
+  // Check values that are bigger than USHRT_MAX (0xFFFF)
   Packet4bi overflow_selector;
-  if(vec_any_gt(int_even, p4ui_low_mask)){
+  if (vec_any_gt(int_even, p4ui_low_mask)) {
     overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
     low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
   }
-  if(vec_any_gt(int_odd, p4ui_low_mask)){
+  if (vec_any_gt(int_odd, p4ui_low_mask)) {
     overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
     low_odd = vec_sel(low_odd, p4ui_low_mask, overflow_selector);
   }
@@ -92,8 +81,9 @@
   return pmerge(low_even, low_odd);
 }
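
Per lane, the widen-convert-clamp sequence above behaves like this scalar
sketch (hypothetical helper; NaN inputs left aside):

  #include <cstdint>
  #include <cstring>

  std::uint16_t bf16_to_u16_saturating(std::uint16_t bf) {
    std::uint32_t fbits = std::uint32_t(bf) << 16;  // bfloat16 is the high half of a float
    float f;
    std::memcpy(&f, &fbits, sizeof(f));
    if (f <= 0.0f) return 0;                 // vec_ctu clamps negatives to zero
    if (f >= 65535.0f) return 0xFFFF;        // the overflow_selector path
    return (std::uint16_t)(std::uint32_t)f;  // truncate, like vec_ctu(a, 0)
  }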
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
-  //short -> int -> float -> bfloat16
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+  // short -> int -> float -> bfloat16
   const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
   Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
   Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
@@ -105,14 +95,11 @@
 
 template <>
 struct type_casting_traits<bfloat16, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
 };
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet8bf, Packet4f>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8bf, Packet4f>(const Packet8bf& a) {
   Packet8us z = pset1<Packet8us>(0);
 #ifdef _BIG_ENDIAN
   return reinterpret_cast<Packet4f>(vec_mergeh(a.m_val, z));
@@ -123,22 +110,21 @@
 
 template <>
 struct type_casting_traits<float, bfloat16> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 2,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
 };
 
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet4f, Packet8bf>(const Packet4f& a, const Packet4f &b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet4f, Packet8bf>(const Packet4f& a, const Packet4f& b) {
   return F32ToBf16Both(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
   return reinterpret_cast<Packet4i>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
   return reinterpret_cast<Packet4f>(a);
 }
 
@@ -149,31 +135,29 @@
 // a slow version that works with older compilers.
 // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-template<>
+template <>
 inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
-#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0)
-  return vec_cts(x, 0);    // TODO: check clang version.
+#if EIGEN_GNUC_STRICT_AT_LEAST(7, 1, 0)
+  return vec_cts(x, 0);  // TODO: check clang version.
 #else
   double tmp[2];
   memcpy(tmp, &x, sizeof(tmp));
-  Packet2l l = { static_cast<long long>(tmp[0]),
-                 static_cast<long long>(tmp[1]) };
+  Packet2l l = {static_cast<long long>(tmp[0]), static_cast<long long>(tmp[1])};
   return l;
 #endif
 }
 
-template<>
+template <>
 inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
   unsigned long long tmp[2];
   memcpy(tmp, &x, sizeof(tmp));
-  Packet2d d = { static_cast<double>(tmp[0]),
-                 static_cast<double>(tmp[1]) };
+  Packet2d d = {static_cast<double>(tmp[0]), static_cast<double>(tmp[1])};
   return d;
 }
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_ALTIVEC_H
+#endif  // EIGEN_TYPE_CASTING_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 93e8714..68b48f9 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -26,16 +26,16 @@
 // As a consequence, we get compile failures when compiling Eigen with
 // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
 // Eigen with GPU support
-  #pragma push_macro("EIGEN_CONSTEXPR")
-  #undef EIGEN_CONSTEXPR
-  #define EIGEN_CONSTEXPR
+#pragma push_macro("EIGEN_CONSTEXPR")
+#undef EIGEN_CONSTEXPR
+#define EIGEN_CONSTEXPR
 #endif
 
-#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)         \
-  template <>                                                       \
-  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED  \
-  PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) {          \
-    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));              \
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)                                         \
+  template <>                                                                                       \
+  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED PACKET_BF16 METHOD<PACKET_BF16>( \
+      const PACKET_BF16& _x) {                                                                      \
+    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));                                              \
   }
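
An illustrative instantiation (packet names depend on the backend): a target
with a Packet8bf type that widens to Packet8f would write
BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin), which expands to a bf16 psin
that simply brackets the float implementation with conversions:

  template <>
  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8bf psin<Packet8bf>(
      const Packet8bf& _x) {
    return F32ToBf16(psin<Packet8f>(Bf16ToF32(_x)));
  }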
 
 // Only use HIP GPU bf16 in kernels
@@ -77,7 +77,7 @@
   unsigned short value;
 };
 
-#endif // defined(EIGEN_USE_HIP_BF16)
+#endif  // defined(EIGEN_USE_HIP_BF16)
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
 template <bool AssumeArgumentIsNormalOrInfinityOrZero>
@@ -95,11 +95,10 @@
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
 };
 
-} // namespace bfloat16_impl
+}  // namespace bfloat16_impl
 
 // Class definition.
 struct bfloat16 : public bfloat16_impl::bfloat16_base {
-
   typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
@@ -109,16 +108,17 @@
   explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
       : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
 
-  template<class T>
+  template <class T>
   explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
-      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
+      : bfloat16_impl::bfloat16_base(
+            bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
 
   explicit EIGEN_DEVICE_FUNC bfloat16(float f)
       : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
 
   // Following the convention of numpy, converting between complex and
   // float will lead to loss of imag value.
-  template<typename RealScalar>
+  template <typename RealScalar>
   explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
       : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
 
@@ -160,62 +160,64 @@
   // detect tininess in the same way for all operations in radix two"
   static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
 
-  static EIGEN_CONSTEXPR Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16(min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
   static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
-  static EIGEN_CONSTEXPR Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16(max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
   static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
   static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
   static EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
   static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
-  static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() {
+    return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0);
+  }
   static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
 };
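
The constants above are raw bfloat16 bit patterns; since bfloat16 is the top
half of an IEEE-754 float, shifting left by 16 decodes them. A small
sanity-check helper (not part of the patch):

  #include <cstdint>
  #include <cstring>

  float decode_bf16(std::uint16_t raw) {
    std::uint32_t bits = std::uint32_t(raw) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
  }
  // decode_bf16(0x3c00) == 0.0078125f      (epsilon = 2^-7, 8-bit significand)
  // decode_bf16(0x0080) == 1.17549435e-38f (smallest normal, 2^-126)
  // decode_bf16(0x7f7f) == 3.3895314e+38f  (largest finite value)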
 
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_specialized;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_signed;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_integer;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_exact;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_infinity;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_quiet_NaN;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_signaling_NaN;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl<T>::has_denorm;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_denorm_loss;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl<T>::round_style;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_iec559;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_bounded;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_modulo;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_digits10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::radix;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::traps;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::tinyness_before;
 }  // end namespace bfloat16_impl
 }  // end namespace Eigen
@@ -225,13 +227,13 @@
 // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
 // std::numeric_limits<const volatile T>
 // https://stackoverflow.com/a/16519653/
-template<>
+template <>
 class numeric_limits<Eigen::bfloat16> : public Eigen::bfloat16_impl::numeric_limits_bfloat16_impl<> {};
-template<>
+template <>
 class numeric_limits<const Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
-template<>
+template <>
 class numeric_limits<volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
-template<>
+template <>
 class numeric_limits<const volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
 }  // end namespace std
 
@@ -242,7 +244,7 @@
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
 // of the functions, while the latter can only deal with one of them.
-#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for bfloat16 floats
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
 // We need to provide emulated *host-side* BF16 operators for clang.
@@ -250,7 +252,7 @@
 #undef EIGEN_DEVICE_FUNC
 #if (defined(EIGEN_HAS_GPU_BF16) && defined(EIGEN_HAS_NATIVE_BF16))
 #define EIGEN_DEVICE_FUNC __host__
-#else // both host and device need emulated ops.
+#else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
 #endif
 #endif
@@ -258,41 +260,41 @@
 // Definitions for CPUs, mostly working through conversion
 // to/from fp32.
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
   return bfloat16(float(a) + float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const int& b) {
   return bfloat16(float(a) + static_cast<float>(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const int& a, const bfloat16& b) {
   return bfloat16(static_cast<float>(a) + float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
   return bfloat16(float(a) * float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
   return bfloat16(float(a) - float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
   return bfloat16(float(a) / float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a) {
   numext::uint16_t x = numext::bit_cast<uint16_t>(a) ^ 0x8000;
   return numext::bit_cast<bfloat16>(x);
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator+=(bfloat16& a, const bfloat16& b) {
   a = bfloat16(float(a) + float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator*=(bfloat16& a, const bfloat16& b) {
   a = bfloat16(float(a) * float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator-=(bfloat16& a, const bfloat16& b) {
   a = bfloat16(float(a) - float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator/=(bfloat16& a, const bfloat16& b) {
   a = bfloat16(float(a) / float(b));
   return a;
 }
@@ -314,22 +316,22 @@
   --a;
   return original_value;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
-  return numext::equal_strict(float(a),float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const bfloat16& a, const bfloat16& b) {
+  return numext::equal_strict(float(a), float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const bfloat16& a, const bfloat16& b) {
   return numext::not_equal_strict(float(a), float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const bfloat16& a, const bfloat16& b) {
   return float(a) < float(b);
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const bfloat16& a, const bfloat16& b) {
   return float(a) <= float(b);
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const bfloat16& a, const bfloat16& b) {
   return float(a) > float(b);
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const bfloat16& a, const bfloat16& b) {
   return float(a) >= float(b);
 }
 
@@ -340,7 +342,7 @@
 
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to bfloat16.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, Index b) {
   return bfloat16(static_cast<float>(a) / static_cast<float>(b));
 }
 
@@ -350,7 +352,7 @@
 #else
   __bfloat16_raw output;
   if (numext::isnan EIGEN_NOT_A_MACRO(v)) {
-    output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
+    output.value = std::signbit(v) ? 0xFFC0 : 0x7FC0;
     return output;
   }
   output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
@@ -368,7 +370,8 @@
 #endif
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(
+    const __bfloat16_raw& bf) {
 #if defined(EIGEN_USE_HIP_BF16)
   return bf.data;
 #else
@@ -391,7 +394,7 @@
     //
     // qNaN magic: All exponent bits set + most significant bit of fraction
     // set.
-    output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
+    output.value = std::signbit(ff) ? 0xFFC0 : 0x7FC0;
   } else {
     // Fast rounding algorithm that rounds a half value to nearest even. This
     // reduces expected error when we convert a large number of floats. Here
@@ -555,140 +558,96 @@
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
 #if defined(EIGEN_USE_HIP_BF16)
-    return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
+  return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
 #else
-    numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
-    __bfloat16_raw output;
+  numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
+  __bfloat16_raw output;
 
-    // Least significant bit of resulting bfloat.
-    numext::uint32_t lsb = (input >> 16) & 1;
-    numext::uint32_t rounding_bias = 0x7fff + lsb;
-    input += rounding_bias;
-    output.value = static_cast<numext::uint16_t>(input >> 16);
-    return output;
+  // Least significant bit of resulting bfloat.
+  numext::uint32_t lsb = (input >> 16) & 1;
+  numext::uint32_t rounding_bias = 0x7fff + lsb;
+  input += rounding_bias;
+  output.value = static_cast<numext::uint16_t>(input >> 16);
+  return output;
 #endif
 }
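
A worked example of the rounding bias: take float bits 0xAAAA8000, where the
discarded low half is exactly 0x8000 (a tie). The kept lsb (bit 16) is 0, so
rounding_bias = 0x7fff and 0x8000 + 0x7fff = 0xffff does not carry; the result
truncates to 0xAAAA, rounding down to the even value. For 0xAAAB8000 the kept
lsb is 1, the bias becomes 0x8000, the addition carries, and the result is
0xAAAC, rounding up, again to even. Away from ties the lsb nudge never
matters: discarded bits above 0x8000 always produce a carry and bits below
never do.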
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
 #if defined(EIGEN_USE_HIP_BF16)
-    return static_cast<float>(h);
+  return static_cast<float>(h);
 #else
-    return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
+  return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
 #endif
 }
 
 // --- standard functions ---
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const bfloat16& a) {
   EIGEN_USING_STD(isinf);
 #if defined(EIGEN_USE_HIP_BF16)
-  return (isinf)(a); // Uses HIP hip_bfloat16 isinf operator
+  return (isinf)(a);  // Uses HIP hip_bfloat16 isinf operator
 #else
   return (isinf)(float(a));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const bfloat16& a) {
   EIGEN_USING_STD(isnan);
 #if defined(EIGEN_USE_HIP_BF16)
-  return (isnan)(a); // Uses HIP hip_bfloat16 isnan operator
+  return (isnan)(a);  // Uses HIP hip_bfloat16 isnan operator
 #else
   return (isnan)(float(a));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
-  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const bfloat16& a) {
+  return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
   numext::uint16_t x = numext::bit_cast<numext::uint16_t>(a) & 0x7FFF;
   return numext::bit_cast<bfloat16>(x);
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
-  return bfloat16(::expf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
-  return bfloat16(numext::expm1(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
-  return bfloat16(::logf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
-  return bfloat16(numext::log1p(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
-  return bfloat16(::log10f(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { return bfloat16(::expf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { return bfloat16(numext::expm1(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { return bfloat16(::logf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { return bfloat16(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { return bfloat16(::log10f(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
   return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
-  return bfloat16(::sqrtf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { return bfloat16(::sqrtf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::powf(float(a), float(b)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan2(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::atan2f(float(a), float(b)));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
-  return bfloat16(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
-  return bfloat16(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
-  return bfloat16(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
-  return bfloat16(::asinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
-  return bfloat16(::acosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
-  return bfloat16(::atanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
-  return bfloat16(::sinhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
-  return bfloat16(::coshf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
-  return bfloat16(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
-  return bfloat16(::asinhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
-  return bfloat16(::acoshf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
-  return bfloat16(::atanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
-  return bfloat16(::floorf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
-  return bfloat16(::ceilf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
-  return bfloat16(::rintf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
-  return bfloat16(::roundf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { return bfloat16(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { return bfloat16(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { return bfloat16(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { return bfloat16(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { return bfloat16(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { return bfloat16(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { return bfloat16(::sinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { return bfloat16(::coshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { return bfloat16(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { return bfloat16(::asinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { return bfloat16(::acoshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { return bfloat16(::atanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { return bfloat16(::floorf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { return bfloat16(::ceilf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { return bfloat16(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { return bfloat16(::roundf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
   return bfloat16(::fmodf(float(a), float(b)));
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(min)(const bfloat16& a, const bfloat16& b) {
   const float f1 = static_cast<float>(a);
   const float f2 = static_cast<float>(b);
   return f2 < f1 ? b : a;
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(max)(const bfloat16& a, const bfloat16& b) {
   const float f1 = static_cast<float>(a);
   const float f2 = static_cast<float>(b);
   return f1 < f2 ? b : a;
@@ -707,42 +666,34 @@
 }
 
 #ifndef EIGEN_NO_IO
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
   os << static_cast<float>(v);
   return os;
 }
 #endif
 
-} // namespace bfloat16_impl
+}  // namespace bfloat16_impl
 
 namespace internal {
 
-template<>
-struct random_default_impl<bfloat16, false, false>
-{
-  static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
-  {
-    return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX));
+template <>
+struct random_default_impl<bfloat16, false, false> {
+  static inline bfloat16 run(const bfloat16& x, const bfloat16& y) {
+    return x + (y - x) * bfloat16(float(std::rand()) / float(RAND_MAX));
   }
-  static inline bfloat16 run()
-  {
-    return run(bfloat16(-1.f), bfloat16(1.f));
-  }
+  static inline bfloat16 run() { return run(bfloat16(-1.f), bfloat16(1.f)); }
 };
 
-template<> struct is_arithmetic<bfloat16> { enum { value = true }; };
+template <>
+struct is_arithmetic<bfloat16> {
+  enum { value = true };
+};
 
-} // namespace internal
+}  // namespace internal
 
-template<> struct NumTraits<Eigen::bfloat16>
-    : GenericNumTraits<Eigen::bfloat16>
-{
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
+template <>
+struct NumTraits<Eigen::bfloat16> : GenericNumTraits<Eigen::bfloat16> {
+  enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
     return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
@@ -764,31 +715,27 @@
   }
 };
 
-} // namespace Eigen
-
+}  // namespace Eigen
 
 #if defined(EIGEN_HAS_HIP_BF16)
-  #pragma pop_macro("EIGEN_CONSTEXPR")
+#pragma pop_macro("EIGEN_CONSTEXPR")
 #endif
 
 namespace Eigen {
 namespace numext {
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isnan)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::bfloat16& h) {
   return (bfloat16_impl::isnan)(h);
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isinf)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::bfloat16& h) {
   return (bfloat16_impl::isinf)(h);
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isfinite)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::bfloat16& h) {
   return (bfloat16_impl::isfinite)(h);
 }
 
@@ -813,7 +760,7 @@
     return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
   }
 };
-} // namespace std
+}  // namespace std
 #endif
 
 // Add the missing shfl* intrinsics.
@@ -831,34 +778,39 @@
 
 #if defined(EIGEN_HAS_HIP_BF16)
 
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta,
+                                                         int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta,
+                                                           int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
-  return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(
+      static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
-  return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(
+      static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
 }
 
-#endif // HIP
+#endif  // HIP
 
-#endif // __shfl*
+#endif  // __shfl*
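
A usage sketch for the wrappers above (HIP device code, assuming a full warp
participates): the usual shuffle-down butterfly reduces a warp's bfloat16
values through the emulated operator+. Accumulating in bfloat16 costs
precision, so real kernels would typically widen to float first.

  __device__ Eigen::bfloat16 warp_sum(Eigen::bfloat16 v) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
      v = v + __shfl_down(v, (unsigned int)offset, warpSize);
    return v;  // lane 0 ends up holding the warp-wide sum
  }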
 
 #if defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::bfloat16 __ldg(const Eigen::bfloat16* ptr) {
-  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(__ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(
+      __ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
-#endif // __ldg
+#endif  // __ldg
 
-#endif // EIGEN_BFLOAT16_H
+#endif  // EIGEN_BFLOAT16_H
diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
index 84da47f..fd7923e 100644
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -11,31 +11,25 @@
 #ifndef EIGEN_ARCH_CONJ_HELPER_H
 #define EIGEN_ARCH_CONJ_HELPER_H
 
-#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)      \
-  template <>                                                           \
-  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {          \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x,         \
-                                          const PACKET_CPLX& y,         \
-                                          const PACKET_CPLX& c) const { \
-      return padd(c, this->pmul(x, y));                                 \
-    }                                                                   \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x,          \
-                                         const PACKET_CPLX& y) const {  \
-      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));   \
-    }                                                                   \
-  };                                                                    \
-                                                                        \
-  template <>                                                           \
-  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {          \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x,         \
-                                          const PACKET_REAL& y,         \
-                                          const PACKET_CPLX& c) const { \
-      return padd(c, this->pmul(x, y));                                 \
-    }                                                                   \
-    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x,          \
-                                         const PACKET_REAL& y) const {  \
-      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));   \
-    }                                                                   \
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)                                                  \
+  template <>                                                                                                       \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {                                                      \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                                                             \
+    }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const {                        \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));                                               \
+    }                                                                                                               \
+  };                                                                                                                \
+                                                                                                                    \
+  template <>                                                                                                       \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {                                                      \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                                                             \
+    }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const {                        \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));                                               \
+    }                                                                                                               \
   };
 
 // IWYU pragma: private
@@ -44,74 +38,88 @@
 namespace Eigen {
 namespace internal {
 
-template<bool Conjugate> struct conj_if;
+template <bool Conjugate>
+struct conj_if;
 
-template<> struct conj_if<true> {
-  template<typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); }
-  template<typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); }
+template <>
+struct conj_if<true> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    return numext::conj(x);
+  }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const {
+    return internal::pconj(x);
+  }
 };
 
-template<> struct conj_if<false> {
-  template<typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; }
-  template<typename T>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; }
+template <>
+struct conj_if<false> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const {
+    return x;
+  }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const {
+    return x;
+  }
 };
 
 // Generic Implementation, assume scalars since the packet-version is
 // specialized below.
-template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+template <typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
 struct conj_helper {
   typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
-  pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const
-  { return this->pmul(x, y) + c; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y,
+                                                         const ResultType& c) const {
+    return this->pmul(x, y) + c;
+  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
-  pmul(const LhsType& x, const RhsType& y) const
-  { return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
+    return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
+  }
 };
 
-template<typename LhsScalar, typename RhsScalar>
+template <typename LhsScalar, typename RhsScalar>
 struct conj_helper<LhsScalar, RhsScalar, true, true> {
-  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResultType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
-  pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const
-  { return this->pmul(x, y) + c; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsScalar& x, const RhsScalar& y,
+                                                         const ResultType& c) const {
+    return this->pmul(x, y) + c;
+  }
 
   // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
-  pmul(const LhsScalar& x, const RhsScalar& y) const
-  { return numext::conj(x * y); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsScalar& x, const RhsScalar& y) const {
+    return numext::conj(x * y);
+  }
 };
 
 // Implementation with equal type, use packet operations.
-template<typename Packet, bool ConjLhs, bool ConjRhs>
-struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
-{
+template <typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs> {
   typedef Packet ResultType;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
-  { return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
+  }
 
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
-  { return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y)); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+    return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
+  }
 };
 
-template<typename Packet>
-struct conj_helper<Packet, Packet, true, true>
-{
+template <typename Packet>
+struct conj_helper<Packet, Packet, true, true> {
   typedef Packet ResultType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
-  { return Eigen::internal::pmadd(pconj(x), pconj(y), c); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmadd(pconj(x), pconj(y), c);
+  }
   // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
-  { return pconj(Eigen::internal::pmul(x, y)); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+    return pconj(Eigen::internal::pmul(x, y));
+  }
 };
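
A quick check of the identity both specializations rely on: for a = x + iy and
b = u + iv, conj(a) * conj(b) = (x - iy)(u - iv) = (xu - yv) - i(xv + yu),
which is exactly conj(a * b). Conjugating the product therefore costs a single
pconj after one pmul instead of two pconjs before it.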
 
 }  // namespace internal
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 3d4a2a5..8fb5b68 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -23,14 +23,27 @@
 namespace internal {
 
 // Creates a Scalar integer type with same bit-width.
-template<typename T> struct make_integer;
-template<> struct make_integer<float>    { typedef numext::int32_t type; };
-template<> struct make_integer<double>   { typedef numext::int64_t type; };
-template<> struct make_integer<half>     { typedef numext::int16_t type; };
-template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
+template <typename T>
+struct make_integer;
+template <>
+struct make_integer<float> {
+  typedef numext::int32_t type;
+};
+template <>
+struct make_integer<double> {
+  typedef numext::int64_t type;
+};
+template <>
+struct make_integer<half> {
+  typedef numext::int16_t type;
+};
+template <>
+struct make_integer<bfloat16> {
+  typedef numext::int16_t type;
+};
 
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
@@ -39,34 +52,32 @@
 
 // Safely applies frexp, correctly handles denormals.
 // Assumes IEEE floating point format.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
-  static constexpr int
-    TotalBits = sizeof(Scalar) * CHAR_BIT,
-    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-    ExponentBits = TotalBits - MantissaBits - 1;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
 
   EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
-      ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
+      ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
   const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
   const Packet half = pset1<Packet>(Scalar(0.5));
   const Packet zero = pzero(a);
-  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)());  // Minimum normal value, 2^-126
 
   // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
   const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
-  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
+  EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
   // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
-  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
   const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
   const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
 
   // Determine exponent offset: -126 if normal, -126-24 if denormal
-  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(ExponentBits-1)) - ScalarUI(2)); // -126
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2));  // -126
   Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
-  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset));  // -24
   exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
 
   // Determine exponent and mantissa from normalized_a.
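
The denormal path in scalar form, for float (a hedged sketch that ignores
+-0, infinities, and NaN; the constants match the comments above):

  #include <cstdint>
  #include <cstring>

  float frexp_sketch(float a, float& exponent) {
    const float normal_min = 1.17549435e-38f;  // 2^-126
    float exponent_offset = -126.0f;
    if (a > -normal_min && a < normal_min) {   // denormal: renormalize first
      a *= 16777216.0f;                        // 2^24 = 2^(MantissaBits + 1)
      exponent_offset -= 24.0f;
    }
    std::uint32_t bits;
    std::memcpy(&bits, &a, sizeof(bits));
    exponent = float((bits >> 23) & 0xffu) + exponent_offset;
    bits = (bits & 0x807fffffu) | 0x3f000000u;  // keep sign/mantissa, force exponent of 0.5
    float m;
    std::memcpy(&m, &bits, sizeof(m));
    return m;  // m in [0.5, 1) with a == m * 2^exponent
  }
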
@@ -83,8 +94,8 @@
 
 // Safely applies ldexp, correctly handles overflows, underflows and denormals.
 // Assumes IEEE floating point format.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
   // We want to return a * 2^exponent, allowing for all possible integer
   // exponents without overflowing or underflowing in intermediate
   // computations.
@@ -93,7 +104,7 @@
   // to consider for a float is:
   //   -255-23 -> 255+23
   // Below -278 any finite float 'a' will become zero, and above +278 any
-  // finite float will become inf, including when 'a' is the smallest possible 
+  // finite float will become inf, including when 'a' is the smallest possible
   // denormal.
   //
   // Unfortunately, 2^(278) cannot be represented using either one or two
@@ -110,19 +121,17 @@
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<PacketI>::type ScalarI;
-  static constexpr int
-    TotalBits = sizeof(Scalar) * CHAR_BIT,
-    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-    ExponentBits = TotalBits - MantissaBits - 1;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
 
-  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) + ScalarI(MantissaBits - 1)));  // 278
-  const PacketI bias = pset1<PacketI>((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1));  // 127
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1));                         // 127
   const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
-  PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+  PacketI b = parithmetic_shift_right<2>(e);                                          // floor(e/4);
   Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^b
-  Packet out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
-  b = psub(psub(psub(e, b), b), b); // e - 3b
-  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^(e-3*b)
+  Packet out = pmul(pmul(pmul(a, c), c), c);                                          // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                                   // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));         // 2^(e-3*b)
   out = pmul(out, c);
   return out;
 }
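
As a scalar illustration of the splitting trick above (a minimal sketch with hypothetical names exp2i and ldexp_split, not Eigen API): writing e = 3b + (e - 3b) with b = floor(e/4) keeps every partial factor 2^b a normal float, since b + 127 stays in [57, 196] even for e near the ±278 limit.

#include <cstdint>
#include <cstring>
#include <cstdio>

// Builds 2^(biased-127) by placing the biased exponent in the exponent field.
static float exp2i(int biased) {
  uint32_t bits = uint32_t(biased) << 23;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

float ldexp_split(float a, int e) {
  if (e > 278) e = 278;           // max useful magnitude: 2^ExponentBits + MantissaBits - 1
  if (e < -278) e = -278;
  int b = e >> 2;                 // floor(e/4), as parithmetic_shift_right<2> does
  float c = exp2i(b + 127);       // 2^b; b in [-70, 69], so always a normal float
  float out = ((a * c) * c) * c;  // a * 2^(3b), no spurious over/underflow
  int r = e - 3 * b;              // remainder, r in [-69, 71]
  return out * exp2i(r + 127);    // a * 2^e overall
}

int main() {
  std::printf("%g\n", ldexp_split(1.0f, -149));  // smallest float denormal, ~1.4e-45
}
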
@@ -136,22 +145,19 @@
 // if 2^e doesn't fit into a normal floating-point Scalar.
 //
 // Assumes IEEE floating point format
-template<typename Packet>
+template <typename Packet>
 struct pldexp_fast_impl {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename unpacket_traits<PacketI>::type ScalarI;
-  static constexpr int
-    TotalBits = sizeof(Scalar) * CHAR_BIT,
-    MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
-    ExponentBits = TotalBits - MantissaBits - 1;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
 
-  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-  Packet run(const Packet& a, const Packet& exponent) {
-    const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1)));  // 127
-    const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) - ScalarI(1)));     // 255
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) {
+    const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
+    const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
     // restrict biased exponent between 0 and 255 for float.
-    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+    const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
     // return a * (2^e)
     return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
   }
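
On scalars the fast variant collapses to a clamp, a shift, and one multiply. A minimal sketch (ldexp_fast is a hypothetical name; exponents outside the clamped range saturate to zero or infinity rather than being handled exactly):

#include <cstdint>
#include <cstring>

// Scalar sketch of pldexp_fast_impl for float: assumes the result of e + 127
// can be clamped to [0, 255] without changing the intended answer.
float ldexp_fast(float a, int e) {
  int biased = e + 127;                    // bias = (1 << (ExponentBits-1)) - 1 = 127
  if (biased < 0) biased = 0;              // underflow: exponent field 0, factor flushes to 0
  if (biased > 255) biased = 255;          // overflow: exponent field of inf
  uint32_t bits = uint32_t(biased) << 23;  // MantissaBits = 23
  float two_to_e;
  std::memcpy(&two_to_e, &bits, sizeof(two_to_e));
  return a * two_to_e;                     // a * 2^e, a single multiply
}
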
@@ -164,17 +170,15 @@
 // TODO(gonnet): Further reduce the interval allowing for lower-degree
 //               polynomial interpolants -> ... -> profit!
 template <typename Packet, bool base2>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_impl_float(const Packet _x)
-{
-  const Packet cst_1              = pset1<Packet>(1.0f);
-  const Packet cst_minus_inf      = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
-  const Packet cst_pos_inf        = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
+  const Packet cst_1 = pset1<Packet>(1.0f);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
 
   const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
   Packet e, x;
   // extract the significand in the range [0.5,1) and the exponent
-  x = pfrexp(_x,e);
+  x = pfrexp(_x, e);
 
   // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
   // and shift by -1. The values are then centered around 0, which improves
@@ -216,27 +220,22 @@
   }
 
   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
-  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
-  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
   // Filter out invalid inputs, i.e.:
   //  - negative arg will be NAN
   //  - 0 will be -INF
   //  - +INF will be +INF
-  return pselect(iszero_mask, cst_minus_inf,
-                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_float(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
   return plog_impl_float<Packet, /* base2 */ false>(_x);
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_float(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
   return plog_impl_float<Packet, /* base2 */ true>(_x);
 }
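
Both wrappers share the same range reduction: log(x) = log(m) + e*log(2) with m = frexp(x) in [0.5, 1), folded into [sqrt(1/2), sqrt(2)) so the polynomial argument is centered around 0. A scalar sketch of just that reduction (shown in double for clarity, with std::log1p standing in for the fitted polynomial):

#include <cmath>
#include <cstdio>

// Scalar sketch of the range reduction used by plog_impl_float.
double log_via_frexp(double v) {
  int e;
  double m = std::frexp(v, &e);      // v = m * 2^e, m in [0.5, 1)
  if (m < 0.70710678118654752440) {  // m < sqrt(1/2): fold into [sqrt(1/2), sqrt(2))
    m += m;                          // double the significand...
    e -= 1;                          // ...and compensate in the exponent
  }
  double x = m - 1.0;                // centered around 0 for the polynomial
  // plog_impl_float evaluates a minimax polynomial here; log1p stands in for it.
  return std::log1p(x) + e * 0.6931471805599453;  // log(m*2^e) = log(1+x) + e*log(2)
}

int main() { std::printf("%.17g\n", log_via_frexp(10.0)); }
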
 
@@ -250,19 +249,16 @@
  * for more detail see: http://www.netlib.org/cephes/
  */
 template <typename Packet, bool base2>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_impl_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
   Packet x = _x;
 
-  const Packet cst_1              = pset1<Packet>(1.0);
-  const Packet cst_neg_half       = pset1<Packet>(-0.5);
-  const Packet cst_minus_inf      = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
-  const Packet cst_pos_inf        = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
+  const Packet cst_1 = pset1<Packet>(1.0);
+  const Packet cst_neg_half = pset1<Packet>(-0.5);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
 
-
- // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
- //                             1/sqrt(2) <= x < sqrt(2)
+  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+  //                             1/sqrt(2) <= x < sqrt(2)
   const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
   const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
   const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
@@ -280,8 +276,8 @@
 
   Packet e;
   // extract the significand in the range [0.5,1) and the exponent
-  x = pfrexp(x,e);
-  
+  x = pfrexp(x, e);
+
   // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
   // and shift by -1. The values are then centered around 0, which improves
   // the stability of the polynomial evaluation.
@@ -301,20 +297,20 @@
   // Evaluate the polynomial approximant, probably to improve instruction-level parallelism.
   // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 );
   Packet y, y1, y_;
-  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
   y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
-  y  = pmadd(y, x, cst_cephes_log_p2);
+  y = pmadd(y, x, cst_cephes_log_p2);
   y1 = pmadd(y1, x, cst_cephes_log_p5);
   y_ = pmadd(y, x3, y1);
 
-  y  = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
   y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
-  y  = pmadd(y, x, cst_cephes_log_q2);
+  y = pmadd(y, x, cst_cephes_log_q2);
   y1 = pmadd(y1, x, cst_cephes_log_q5);
-  y  = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y1);
 
   y_ = pmul(y_, x3);
-  y  = pdiv(y_, y);
+  y = pdiv(y_, y);
 
   y = pmadd(cst_neg_half, x2, y);
   x = padd(x, y);
@@ -329,36 +325,30 @@
   }
 
   Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
-  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
-  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
   // Filter out invalid inputs, i.e.:
   //  - negative arg will be NAN
   //  - 0 will be -INF
   //  - +INF will be +INF
-  return pselect(iszero_mask, cst_minus_inf,
-                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
   return plog_impl_double<Packet, /* base2 */ false>(_x);
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
   return plog_impl_double<Packet, /* base2 */ true>(_x);
 }
 
 /** \internal \returns log(1 + x) computed using W. Kahan's formula.
     See: http://www.plunk.org/~hatch/rightway.php
  */
-template<typename Packet>
-Packet generic_plog1p(const Packet& x)
-{
+template <typename Packet>
+Packet generic_plog1p(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type ScalarType;
   const Packet one = pset1<Packet>(ScalarType(1));
   Packet xp1 = padd(x, one);
@@ -372,9 +362,8 @@
 /** \internal \returns exp(x)-1 computed using W. Kahan's formula.
     See: http://www.plunk.org/~hatch/rightway.php
  */
-template<typename Packet>
-Packet generic_expm1(const Packet& x)
-{
+template <typename Packet>
+Packet generic_expm1(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type ScalarType;
   const Packet one = pset1<Packet>(ScalarType(1));
   const Packet neg_one = pset1<Packet>(ScalarType(-1));
@@ -390,25 +379,18 @@
   Packet pos_inf_mask = pcmp_eq(logu, u);
   Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
   expm1 = pselect(pos_inf_mask, u, expm1);
-  return pselect(one_mask,
-                 x,
-                 pselect(neg_one_mask,
-                         neg_one,
-                         expm1));
+  return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
 }
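
Kahan's formula written out for scalars (a sketch; the packet version expresses the same special cases as masks): with u = exp(x), expm1(x) is recovered as (u - 1) * x / log(u), and the rounding error committed in u - 1 for tiny x cancels against the matching error in log(u).

#include <cmath>
#include <cstdio>

// Scalar sketch of generic_expm1 (W. Kahan's formula).
double expm1_kahan(double x) {
  double u = std::exp(x);
  if (u == 1.0) return x;          // one_mask: exp(x) rounded to exactly 1
  double um1 = u - 1.0;
  if (um1 == -1.0) return -1.0;    // neg_one_mask: exp(x) underflowed to 0
  if (std::log(u) == u) return u;  // pos_inf_mask: u == +inf
  return um1 * (x / std::log(u));  // rounding of um1 cancels against log(u)
}

int main() {
  std::printf("%.17g vs %.17g\n", expm1_kahan(1e-12), std::expm1(1e-12));
}
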
 
-
 // Exponential function. Works by writing "x = m*log(2) + r" where
 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
 // exp(r) is computed using a 6th order minimax polynomial approximation.
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_float(const Packet _x)
-{
-  const Packet cst_zero   = pset1<Packet>(0.0f);
-  const Packet cst_one    = pset1<Packet>(1.0f);
-  const Packet cst_half   = pset1<Packet>(0.5f);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
+  const Packet cst_zero = pset1<Packet>(0.0f);
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_half = pset1<Packet>(0.5f);
   const Packet cst_exp_hi = pset1<Packet>(88.723f);
   const Packet cst_exp_lo = pset1<Packet>(-104.f);
 
@@ -447,13 +429,11 @@
 
   // Return 2^m * exp(r).
   // TODO: replace pldexp with faster implementation since y in [-1, 1).
-  return pselect(zero_mask, cst_zero, pmax(pldexp(y,m), _x));
+  return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
 }
 
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
   Packet x = _x;
   const Packet cst_zero = pset1<Packet>(0.0);
   const Packet cst_1 = pset1<Packet>(1.0);
@@ -516,7 +496,7 @@
   // Construct the result 2^n * exp(g) = e * x. The max is used to catch
   // non-finite values in the input.
   // TODO: replace pldexp with faster implementation since x in [-1, 1).
-  return pselect(zero_mask, cst_zero, pmax(pldexp(x,fx), _x));
+  return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
 }
 
 // The following code is inspired by the following stack-overflow answer:
@@ -528,29 +508,22 @@
 //    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
 //  - Avoid a branch in rounding and extraction of the remaining fractional part.
 // Overall, I measured a speedup of more than 2x on x86-64.
-inline float trig_reduce_huge (float xf, Eigen::numext::int32_t *quadrant)
-{
+inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
   using Eigen::numext::int32_t;
-  using Eigen::numext::uint32_t;
   using Eigen::numext::int64_t;
+  using Eigen::numext::uint32_t;
   using Eigen::numext::uint64_t;
 
-  const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
-  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
+  const double pio2_62 = 3.4061215800865545e-19;     // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61;  // 0.5 in 2.62-bit fixed-point format
 
   // 192 bits of 2/pi for Payne-Hanek reduction
   // Bits are introduced in packets of 8 to enable aligned reads.
-  static const uint32_t two_over_pi [] = 
-  {
-    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
-    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
-    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
-    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
-    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
-    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
-    0x10e41000, 0xe4100000
-  };
-  
+  static const uint32_t two_over_pi[] = {
+      0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
+      0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
+      0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
+
   uint32_t xi = numext::bit_cast<uint32_t>(xf);
   // Below, -118 = -126 + 8.
   //   -126 is to get the exponent,
@@ -558,12 +531,12 @@
   // This is possible because the fractional part of x has only 24 meaningful bits.
   uint32_t e = (xi >> 23) - 118;
   // Extract the mantissa and shift it to align it wrt the exponent
-  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+  xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
 
   uint32_t i = e >> 3;
-  uint32_t twoopi_1  = two_over_pi[i-1];
-  uint32_t twoopi_2  = two_over_pi[i+3];
-  uint32_t twoopi_3  = two_over_pi[i+7];
+  uint32_t twoopi_1 = two_over_pi[i - 1];
+  uint32_t twoopi_2 = two_over_pi[i + 3];
+  uint32_t twoopi_3 = two_over_pi[i + 7];
 
   // Compute x * 2/pi in 2.62-bit fixed-point format.
   uint64_t p;
@@ -578,23 +551,23 @@
   // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
   //   r = (p-q)*pi/2,
   // where the product can be carried out with sufficient accuracy using double precision.
-  p -= q<<62;
+  p -= q << 62;
   return float(double(int64_t(p)) * pio2_62);
 }
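
The last few lines work in 2.62-bit fixed point: p holds x*(2/pi), adding 0.5 and shifting right by 62 yields the nearest integer multiple q of pi/2, and the wrapped difference is the signed fraction that becomes the remainder. A standalone sketch of just that step (split_quadrant is a hypothetical name; the real code first assembles p from the two_over_pi table):

#include <cstdint>
#include <cstdio>

// Sketch of the 2.62 fixed-point quadrant/remainder extraction used above.
void split_quadrant(uint64_t p, int32_t* quadrant, double* r) {
  const double pio2_62 = 3.4061215800865545e-19;  // pi/2 * 2^-62
  const uint64_t half = uint64_t(1) << 61;        // 0.5 in 2.62 fixed point
  uint64_t q = (p + half) >> 62;                  // nearest integer multiple of pi/2
  *quadrant = int32_t(q & 3);                     // only the quadrant mod 4 matters
  p -= q << 62;                                   // signed fractional part, |.| <= 0.5
  *r = double(int64_t(p)) * pio2_62;              // remainder in [-pi/4, pi/4]
}

int main() {
  // p = 0.75 in 2.62 fixed point => q = 1, r = -0.25 * (pi/2) = -pi/8.
  int32_t quad; double r;
  split_quadrant(uint64_t(3) << 60, &quad, &r);
  std::printf("quadrant=%d r=%.17g\n", quad, r);
}
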
 
-template<bool ComputeSine,typename Packet>
+template <bool ComputeSine, typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 #if EIGEN_COMP_GNUC_STRICT
-__attribute__((optimize("-fno-unsafe-math-optimizations")))
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
 #endif
-Packet psincos_float(const Packet& _x)
-{
+    Packet
+    psincos_float(const Packet& _x) {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
 
-  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
-  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
-  const PacketI csti_1              = pset1<PacketI>(1);
-  const Packet  cst_sign_mask       = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
+  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f);  // 2/PI
+  const Packet cst_rounding_magic = pset1<Packet>(12582912);           // 2^23 for rounding
+  const PacketI csti_1 = pset1<PacketI>(1);
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
 
   Packet x = pabs(_x);
 
@@ -604,19 +577,19 @@
   // Rounding trick to find nearest integer:
   Packet y_round = padd(y, cst_rounding_magic);
   EIGEN_OPTIMIZATION_BARRIER(y_round)
-  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
-  y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
+  PacketI y_int = preinterpret<PacketI>(y_round);  // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic);           // nearest integer to x * (2/pi)
 
-  // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
-  // using "Extended precision modular arithmetic"
-  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
+// using "Extended precision modular arithmetic"
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
   // This version requires true FMA for high accuracy
   // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
   const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
   x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
   x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
   x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
-  #else
+#else
   // Without true FMA, the previous set of coefficients maintains 1ULP accuracy
   // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
   // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
@@ -624,29 +597,28 @@
   // The following set of coefficients maintains 1ULP up to 9.43 and 14.16 for sin and cos respectively,
   // and 2 ULP up to:
   const float huge_th = ComputeSine ? 25966.f : 18838.f;
-  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+  x = pmadd(y, pset1<Packet>(-1.5703125), x);  // = 0xbfc90000
   EIGEN_OPTIMIZATION_BARRIER(x)
-  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x);  // = 0xb9fdc000
   EIGEN_OPTIMIZATION_BARRIER(x)
-  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
-  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x);                      // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x);  // = 0x2e74b9ee
 
-  // For the record, the following set of coefficients maintain 2ULP up
-  // to a slightly larger range:
-  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
-  // but it slightly fails to maintain 1ULP for two values of sin below pi.
-  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
-  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
-  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
-  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+// For the record, the following set of coefficients maintain 2ULP up
+// to a slightly larger range:
+// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+// but it slightly fails to maintain 1ULP for two values of sin below pi.
+// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
 
-  // For the record, with only 3 iterations it is possible to maintain
-  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
-  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
-  #endif
+// For the record, with only 3 iterations it is possible to maintain
+// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+#endif
 
-  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
-  {
+  if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
     const int PacketSize = unpacket_traits<Packet>::size;
     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
     EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
@@ -654,11 +626,9 @@
     pstoreu(vals, pabs(_x));
     pstoreu(x_cpy, x);
     pstoreu(y_int2, y_int);
-    for(int k=0; k<PacketSize;++k)
-    {
+    for (int k = 0; k < PacketSize; ++k) {
       float val = vals[k];
-      if(val>=huge_th && (numext::isfinite)(val))
-        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+      if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
     }
     x = ploadu<Packet>(x_cpy);
     y_int = ploadu<PacketI>(y_int2);
@@ -668,19 +638,19 @@
   // sin: sign = second_bit(y_int) xor signbit(_x)
   // cos: sign = second_bit(y_int+1)
   Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
-                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
-  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
 
   // Get the polynomial selection mask from the second bit of y_int
   // We'll calculate both (sin and cos) polynomials and then select from the two.
   Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
 
-  Packet x2 = pmul(x,x);
+  Packet x2 = pmul(x, x);
 
   // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
-  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
-  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
-  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
+  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
   y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
   y1 = pmadd(y1, x2, pset1<Packet>(1.f));
 
@@ -692,38 +662,32 @@
   //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
   //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
   //
-  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
-  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
   y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
   y2 = pmul(y2, x2);
   y2 = pmadd(y2, x, x);
 
   // Select the correct result from the two polynomials.
-  y = ComputeSine ? pselect(poly_mask,y2,y1)
-                  : pselect(poly_mask,y1,y2);
+  y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
 
   // Update the sign and filter huge inputs
   return pxor(y, sign_bit);
 }
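
The rounding magic deserves a note: adding 1.5 * 2^23 = 12582912 to y (with |y| < 2^22) forces the FPU to round y to an integer whose bits land in the low part of the significand, so one addition produces both the nearest-integer float and its integer bits. A scalar sketch, with volatile standing in for EIGEN_OPTIMIZATION_BARRIER:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Scalar sketch of the round-to-nearest-integer trick in psincos_float.
void round_magic(float y, float* y_rounded, int32_t* y_int) {
  const float magic = 12582912.0f;  // 1.5 * 2^23
  volatile float t = y + magic;     // sum lands in [2^23, 2^24), where ulp == 1
  float y_round = t;
  int32_t bits;
  std::memcpy(&bits, &y_round, sizeof(bits));
  *y_int = bits & 0x3FFFFF;         // nearest integer mod 2^22 (two's complement);
                                    // the low two bits suffice for quadrant/sign
  *y_rounded = y_round - magic;     // nearest integer to y, as a float
}

int main() {
  float yr; int32_t yi;
  round_magic(5.4f, &yr, &yi);
  std::printf("%g %d\n", yr, yi);   // 5 5
}
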
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin_float(const Packet& x)
-{
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
   return psincos_float<true>(x);
 }
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos_float(const Packet& x)
-{
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
   return psincos_float<false>(x);
 }
 
 // Generic implementation of acos(x).
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
@@ -747,7 +711,7 @@
   //   P(x) = p0 + x * (p1 +  x * (p2 + ... (p5 + x * p6)) ... ) .
   // We evaluate even and odd terms independently to increase
   // instruction level parallelism.
-  Packet x2 = pmul(x_in,x_in);
+  Packet x2 = pmul(x_in, x_in);
   Packet p_even = pmadd(p6, x2, p4);
   Packet p_odd = pmadd(p5, x2, p3);
   p_even = pmadd(p_even, x2, p2);
@@ -765,9 +729,8 @@
 }
 
 // Generic implementation of asin(x).
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
@@ -817,9 +780,8 @@
 }
 
 // Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_reduced_float(const Packet& x) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_float(const Packet& x) {
   const Packet q0 = pset1<Packet>(-0.3333314359188079833984375f);
   const Packet q2 = pset1<Packet>(0.19993579387664794921875f);
   const Packet q4 = pset1<Packet>(-0.14209578931331634521484375f);
@@ -849,9 +811,8 @@
   return pmadd(q, pmul(x, x2), x);
 }
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x_in) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
 
@@ -879,28 +840,17 @@
 // Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
 // with 2 ulp accuracy.
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-patan_reduced_double(const Packet& x) {
-  const Packet q0 =
-      pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
-  const Packet q2 =
-      pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
-  const Packet q4 =
-      pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
-  const Packet q6 =
-      pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
-  const Packet q8 =
-      pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
-  const Packet q10 =
-      pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
-  const Packet q12 =
-      pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
-  const Packet q14 =
-      pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
-  const Packet q16 =
-      pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
-  const Packet q18 =
-      pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_double(const Packet& x) {
+  const Packet q0 = pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
+  const Packet q2 = pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
+  const Packet q4 = pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
+  const Packet q6 = pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
+  const Packet q8 = pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
+  const Packet q10 = pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
+  const Packet q12 = pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
+  const Packet q14 = pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
+  const Packet q16 = pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
+  const Packet q18 = pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
 
   // Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
   //   P(x) = x + x^3 * Q(x^2),
@@ -922,9 +872,8 @@
   return pmadd(p, pmul(x, x2), x);
 }
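
Because atan is odd, only odd powers appear: atan(x) ~= x + x^3 * Q(x^2). A scalar sketch using the q0...q18 coefficients above, evaluated by plain Horner (the packet code splits Q into even and odd halves for instruction-level parallelism, which this sketch omits):

#include <cmath>
#include <cstdio>

// Scalar sketch of patan_reduced_double: atan(x) ~= x + x^3 * Q(x^2)
// for |x| <= tan(pi/8), with Q a degree-9 polynomial in x^2.
double atan_reduced(double x) {
  static const double q[10] = {
      -0.33333333333330028569463365784031338989734649658203,
      0.199999999990664090177006073645316064357757568359375,
      -0.142857141937123677255527809393242932856082916259766,
      0.111111065991039953404495577160560060292482376098633,
      -9.0907812986129224452902519715280504897236824035645e-2,
      7.6900542950704739442180368769186316058039665222168e-2,
      -6.6410112986494976294871150912513257935643196105957e-2,
      5.6920144995467943094258345126945641823112964630127e-2,
      -4.3577020814990513608577771265117917209863662719727e-2,
      2.1244050233624342527427586446719942614436149597168e-2};
  double x2 = x * x;
  double p = q[9];                          // Horner in x^2, highest term first
  for (int i = 8; i >= 0; --i) p = p * x2 + q[i];
  return p * (x * x2) + x;                  // x + x^3 * Q(x^2)
}

int main() {
  double t = 0.41421356237309503;           // tan(pi/8) = sqrt(2) - 1
  std::printf("%.17g vs %.17g\n", atan_reduced(t), std::atan(t));
}
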
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_double(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x_in) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
 
@@ -968,9 +917,8 @@
   return pxor(p, x_signmask);
 }
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patanh_float(const Packet& x) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
   const Packet half = pset1<Packet>(0.5f);
@@ -982,12 +930,12 @@
   const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
   const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
   const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
-  const Packet x2 = pmul(x,x);
+  const Packet x2 = pmul(x, x);
   Packet p = pmadd(C11, x2, C9);
   p = pmadd(x2, p, C7);
   p = pmadd(x2, p, C5);
   p = pmadd(x2, p, C3);
-  p = pmadd(pmul(x,x2), p, x);
+  p = pmadd(pmul(x, x2), p, x);
 
   // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
   const Packet one = pset1<Packet>(1.0f);
@@ -996,19 +944,18 @@
   return pselect(x_gt_half, r, p);
 }
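
For reference, the closed form used on the |x| > 0.5 branch is the textbook identity atanh(x) = 0.5 * log((1 + x)/(1 - x)); a scalar sketch of just that branch (the |x| <= 0.5 branch is the odd polynomial above):

#include <cmath>
#include <cstdio>

// Scalar sketch of the |x| > 0.5 branch of patanh_float.
float atanh_large(float x) {
  return 0.5f * std::log((1.0f + x) / (1.0f - x));
}

int main() { std::printf("%g vs %g\n", atanh_large(0.75f), std::atanh(0.75f)); }
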
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdiv_complex(const Packet& x, const Packet& y) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::as_real RealPacket;
   // In the following we annotate the code for the case where the inputs
   // are a pair of length-2 SIMD vectors representing a single pair of complex
   // numbers x = a + i*b, y = c + i*d.
-  const RealPacket y_abs = pabs(y.v);  // |c|, |d|
-  const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
-  const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
-  const RealPacket y_scaled = pdiv(y.v, y_max);  // c / max(|c|, |d|), d / max(|c|, |d|)
+  const RealPacket y_abs = pabs(y.v);                        // |c|, |d|
+  const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v;  // |d|, |c|
+  const RealPacket y_max = pmax(y_abs, y_abs_flip);          // max(|c|, |d|), max(|c|, |d|)
+  const RealPacket y_scaled = pdiv(y.v, y_max);              // c / max(|c|, |d|), d / max(|c|, |d|)
   // Compute scaled denominator.
-  const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
+  const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled);  // c'**2, d'**2
   const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
   Packet result_scaled = pmul(x, pconj(Packet(y_scaled)));  // a * c' + b * d', -a * d' + b * c'
   // Divide elementwise by denom.
@@ -1017,9 +964,8 @@
   return Packet(pdiv(result_scaled.v, y_max));
 }
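
On scalars the same overflow-safe scheme reads as follows (a sketch of the annotated steps; the packet version processes interleaved real/imaginary lanes at once): dividing c and d by m = max(|c|, |d|) keeps the denominator c'^2 + d'^2 in [1, 2], and a final division by m undoes the scaling.

#include <cmath>
#include <complex>
#include <cstdio>

// Scalar sketch of pdiv_complex: (a + i b) / (c + i d) without overflowing
// in c*c + d*d when |c| or |d| is near the type's maximum.
std::complex<float> cdiv_scaled(std::complex<float> x, std::complex<float> y) {
  float a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
  float m = std::fmax(std::fabs(c), std::fabs(d));  // max(|c|, |d|)
  float cs = c / m, ds = d / m;                     // scaled denominator parts
  float denom = cs * cs + ds * ds;                  // in [1, 2], cannot overflow
  // x * conj(y_scaled): (a*cs + b*ds) + i(b*cs - a*ds)
  float re = (a * cs + b * ds) / denom;
  float im = (b * cs - a * ds) / denom;
  return {re / m, im / m};                          // undo the scaling
}

int main() {
  auto q = cdiv_scaled({1.0f, 2.0f}, {3.0f, 4.0f});
  std::printf("%g + %gi\n", q.real(), q.imag());    // 0.44 + 0.08i
}
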
 
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt_complex(const Packet& a) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   typedef typename Scalar::value_type RealScalar;
   typedef typename unpacket_traits<Packet>::as_real RealPacket;
@@ -1060,14 +1006,14 @@
   //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
   // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
 
-  RealPacket a_abs = pabs(a.v);           // [|x0|, |y0|, |x1|, |y1|]
-  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
+  RealPacket a_abs = pabs(a.v);                        // [|x0|, |y0|, |x1|, |y1|]
+  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;  // [|y0|, |x0|, |y1|, |x1|]
   RealPacket a_max = pmax(a_abs, a_abs_flip);
   RealPacket a_min = pmin(a_abs, a_abs_flip);
   RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
   RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
   RealPacket r = pdiv(a_min, a_max);
-  const RealPacket cst_one  = pset1<RealPacket>(RealScalar(1));
+  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
   RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
   // Set l to a_max if a_min is zero.
   l = pselect(a_min_zero_mask, a_max, l);
@@ -1090,8 +1036,7 @@
 
   // Step 4. Compute solution for inputs with negative real part:
   //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
-  const RealPacket cst_imag_sign_mask =
-      pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
+  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
   RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
   Packet negative_real_result;
   // Notice that rho is positive, so taking its absolute value is a no-op.
@@ -1131,7 +1076,6 @@
   return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
 }
 
-
 template <typename Packet>
 struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
                                            !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
@@ -1222,18 +1166,16 @@
 
 // This function splits x into the nearest integer n and fractional part r,
 // such that x = n + r holds exactly.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void absolute_split(const Packet& x, Packet& n, Packet& r) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
   n = pround(x);
   r = psub(x, n);
 }
 
 // This function computes the pair {s_hi, s_lo} such that x + y = s_hi + s_lo
 // holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
   s_hi = padd(x, y);
   const Packet t = psub(s_hi, x);
   s_lo = psub(y, t);
@@ -1244,10 +1186,8 @@
 // a pair of floating point numbers. Given {x, y}, it computes the pair
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
 // p_hi = fl(x * y).
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x, const Packet& y,
-             Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
   p_hi = pmul(x, y);
   p_lo = pmsub(x, y, p_hi);
 }
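
With hardware FMA this product transform is error-free: p_hi = fl(x*y) and p_lo = fma(x, y, -p_hi) satisfy x*y = p_hi + p_lo exactly. A scalar sketch, assuming std::fma maps to a fused instruction:

#include <cmath>
#include <cstdio>

// Scalar sketch of the FMA-based twoprod: x * y == p_hi + p_lo exactly.
void twoprod_scalar(double x, double y, double& p_hi, double& p_lo) {
  p_hi = x * y;                  // rounded product
  p_lo = std::fma(x, y, -p_hi);  // exact remainder x*y - p_hi
}

int main() {
  double hi, lo;
  twoprod_scalar(1.0 + 0x1p-30, 1.0 + 0x1p-30, hi, lo);
  std::printf("hi=%.17g lo=%.17g\n", hi, lo);  // lo = 2^-60, the part lost from hi
}
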
@@ -1259,9 +1199,8 @@
 // exactly and that half of the significand of x fits in x_hi.
 // This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
   typedef typename unpacket_traits<Packet>::type Scalar;
   EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
   const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
@@ -1275,10 +1214,8 @@
 // Given floating point numbers {x, y} computes the pair
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
 // p_hi = fl(x * y).
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x, const Packet& y,
-             Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
   Packet x_hi, x_lo, y_hi, y_lo;
   veltkamp_splitting(x, x_hi, x_lo);
   veltkamp_splitting(y, y_hi, y_lo);
@@ -1292,23 +1229,20 @@
 
 #endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 
-
 // This function implements Dekker's algorithm for the addition
 // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
 // It returns the result as a pair {s_hi, s_lo} such that
 // x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
 // This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-  void twosum(const Packet& x_hi, const Packet& x_lo,
-              const Packet& y_hi, const Packet& y_lo,
-              Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+                                Packet& s_hi, Packet& s_lo) {
   const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
   Packet r_hi_1, r_lo_1;
-  fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
+  fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
   Packet r_hi_2, r_lo_2;
-  fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
+  fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
   const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
 
   const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
@@ -1320,11 +1254,9 @@
 
 // This is a version of twosum for double word numbers,
 // which assumes that |x_hi| >= |y_hi|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-  void fast_twosum(const Packet& x_hi, const Packet& x_lo,
-              const Packet& y_hi, const Packet& y_lo,
-              Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+                                     Packet& s_hi, Packet& s_lo) {
   Packet r_hi, r_lo;
   fast_twosum(x_hi, y_hi, r_hi, r_lo);
   const Packet s = padd(padd(y_lo, r_lo), x_lo);
@@ -1334,11 +1266,9 @@
 // This is a version of twosum for adding a floating point number x to
 // a double word number {y_hi, y_lo}, with the assumption
 // that |x| >= |y_hi|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void fast_twosum(const Packet& x,
-                 const Packet& y_hi, const Packet& y_lo,
-                 Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, Packet& s_hi,
+                                     Packet& s_lo) {
   Packet r_hi, r_lo;
   fast_twosum(x, y_hi, r_hi, r_lo);
   const Packet s = padd(y_lo, r_lo);
@@ -1353,10 +1283,8 @@
 // in the floating point type.
 // This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
 // 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
-             Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& p_hi, Packet& p_lo) {
   Packet c_hi, c_lo1;
   twoprod(x_hi, y, c_hi, c_lo1);
   const Packet c_lo2 = pmul(x_lo, y);
@@ -1372,11 +1300,9 @@
 // (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
 // of less than 2*2^{-2p}, where p is the number of significand bits
 // in the floating point type.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x_hi, const Packet& x_lo,
-             const Packet& y_hi, const Packet& y_lo,
-             Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+                                 Packet& p_hi, Packet& p_lo) {
   Packet p_hi_hi, p_hi_lo;
   twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
   Packet p_lo_hi, p_lo_lo;
@@ -1389,8 +1315,7 @@
 // for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
 // 2017. https://hal.archives-ouvertes.fr/hal-01351529
 template <typename Packet>
-void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
-                           Packet& z_hi, Packet& z_lo) {
+void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& z_hi, Packet& z_lo) {
   const Packet t_hi = pdiv(x_hi, y);
   Packet pi_hi, pi_lo;
   twoprod(t_hi, y, pi_hi, pi_lo);
@@ -1405,8 +1330,7 @@
 template <typename Scalar>
 struct accurate_log2 {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+  EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
     log2_x_hi = plog2(x);
     log2_x_lo = pzero(x);
   }
@@ -1421,8 +1345,7 @@
 template <>
 struct accurate_log2<float> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+  EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
     // The function log(1+x)/x is approximated in the interval
     // [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
     //  Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
@@ -1437,14 +1360,14 @@
     // > f = log2(1+x)/x;
     // > interval = [sqrt(0.5)-1;sqrt(2)-1];
     // > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
-    
-    const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
+
+    const Packet p6 = pset1<Packet>(9.703654795885e-2f);
     const Packet p5 = pset1<Packet>(-0.1690667718648f);
-    const Packet p4 = pset1<Packet>( 0.1720575392246f);
+    const Packet p4 = pset1<Packet>(0.1720575392246f);
     const Packet p3 = pset1<Packet>(-0.1789081543684f);
-    const Packet p2 = pset1<Packet>( 0.2050433009862f);
+    const Packet p2 = pset1<Packet>(0.2050433009862f);
     const Packet p1 = pset1<Packet>(-0.2404672354459f);
-    const Packet p0 = pset1<Packet>( 0.2885761857032f);
+    const Packet p0 = pset1<Packet>(0.2885761857032f);
 
     const Packet C3_hi = pset1<Packet>(-0.360674142838f);
     const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
@@ -1460,7 +1383,7 @@
     // Evaluate P(x) in working precision.
     // We evaluate it in multiple parts to improve instruction level
     // parallelism.
-    Packet x2 = pmul(x,x);
+    Packet x2 = pmul(x, x);
     Packet p_even = pmadd(p6, x2, p4);
     p_even = pmadd(p_even, x2, p2);
     p_even = pmadd(p_even, x2, p0);
@@ -1502,8 +1425,7 @@
 template <>
 struct accurate_log2<double> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+  EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
     // We use a transformation of variables:
     //    r = c * (x-1) / (x+1),
     // such that
@@ -1588,8 +1510,7 @@
 template <typename Scalar>
 struct fast_accurate_exp2 {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  Packet operator()(const Packet& x) {
+  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
     // TODO(rmlarsen): Add a pexp2 packetop.
     return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
   }
@@ -1602,8 +1523,7 @@
 template <>
 struct fast_accurate_exp2<float> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  Packet operator()(const Packet& x) {
+  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
     // This function approximates exp2(x) by a degree 6 polynomial of the form
     // Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
     // single precision, and the remaining steps are evaluated with extra precision using
@@ -1628,7 +1548,7 @@
     // Evaluate P(x) in working precision.
     // We evaluate even and odd parts of the polynomial separately
     // to gain some instruction level parallelism.
-    Packet x2 = pmul(x,x);
+    Packet x2 = pmul(x, x);
     Packet p_even = pmadd(p4, x2, p2);
     Packet p_odd = pmadd(p3, x2, p1);
     p_even = pmadd(p_even, x2, p0);
@@ -1660,8 +1580,7 @@
 template <>
 struct fast_accurate_exp2<double> {
   template <typename Packet>
-  EIGEN_STRONG_INLINE
-  Packet operator()(const Packet& x) {
+  EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
     // This function approximates exp2(x) by a degree 10 polynomial of the form
     // Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
     // single precision, and the remaining steps are evaluated with extra precision using
@@ -1683,14 +1602,14 @@
     const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
     const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
     const Packet p0 = pset1<Packet>(0.240226506959101332);
-    const Packet C_hi = pset1<Packet>(0.693147180559945286); 
+    const Packet C_hi = pset1<Packet>(0.693147180559945286);
     const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
     const Packet one = pset1<Packet>(1.0);
 
     // Evaluate P(x) in working precision.
     // We evaluate even and odd parts of the polynomial separately
     // to gain some instruction level parallelism.
-    Packet x2 = pmul(x,x);
+    Packet x2 = pmul(x, x);
     Packet p_even = pmadd(p8, x2, p6);
     Packet p_odd = pmadd(p9, x2, p7);
     p_even = pmadd(p_even, x2, p4);
@@ -1885,15 +1804,17 @@
  */
 template <typename Packet, int N>
 struct ppolevl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
     EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
   }
 };
 
 template <typename Packet>
 struct ppolevl<Packet, 0> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
     EIGEN_UNUSED_VARIABLE(x);
     return pset1<Packet>(coeff[0]);
   }
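
The recursion unrolls into Horner's rule with coeff[0] as the highest-order coefficient, matching the Cephes polevl convention. A scalar sketch of the same template shape (polevl here is an illustrative stand-in, not the Eigen struct):

#include <cstdio>

// Sketch of the ppolevl recursion on scalars: Horner's rule, highest term first.
template <int N>
struct polevl {
  static double run(double x, const double coeff[]) {
    return polevl<N - 1>::run(x, coeff) * x + coeff[N];
  }
};
template <>
struct polevl<0> {
  static double run(double, const double coeff[]) { return coeff[0]; }
};

int main() {
  const double c[3] = {2.0, -3.0, 1.0};          // 2x^2 - 3x + 1
  std::printf("%g\n", polevl<2>::run(4.0, c));   // (2*4 - 3)*4 + 1 = 21
}
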
@@ -1953,8 +1874,8 @@
 
 template <typename Packet, int N>
 struct pchebevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+                                                          const typename unpacket_traits<Packet>::type coef[]) {
     typedef typename unpacket_traits<Packet>::type Scalar;
     Packet b0 = pset1<Packet>(coef[0]);
     Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
@@ -2052,14 +1973,14 @@
 
 template <typename Packet>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
-                                                            const typename unpacket_traits<Packet>::type& exponent) {
+                                                     const typename unpacket_traits<Packet>::type& exponent) {
   const Packet exponent_packet = pset1<Packet>(exponent);
   return generic_pow_impl(x, exponent_packet);
 }
 
 template <typename Packet, typename ScalarExponent>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
-                                                                                const ScalarExponent& exponent) {
+                                                                         const ScalarExponent& exponent) {
   using Scalar = typename unpacket_traits<Packet>::type;
 
   // non-integer base and exponent case
@@ -2153,7 +2074,6 @@
   return pand(x_is_one, x);
 }
 
-
 }  // end namespace unary_pow
 
 template <typename Packet, typename ScalarExponent,
@@ -2205,7 +2125,7 @@
   }
 };
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 9e038ab..ade9f3f 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -22,110 +22,96 @@
 
 /***************************************************************************
  * Some generic implementations to be used by implementors
-***************************************************************************/
+ ***************************************************************************/
 
 /** Default implementation of pfrexp.
-  * It is expected to be called by implementers of template<> pfrexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic(const Packet& a, Packet& exponent);
+ * It is expected to be called by implementers of template<> pfrexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent);
 
 // Extracts the biased exponent value from Packet p, and casts the results to
 // a floating-point Packet type. Used by pfrexp_generic. Override this if
 // there is no unpacket_traits<Packet>::integer_packet.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic_get_biased_exponent(const Packet& p);
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& p);
 
 /** Default implementation of pldexp.
-  * It is expected to be called by implementers of template<> pldexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pldexp_generic(const Packet& a, const Packet& exponent);
+ * It is expected to be called by implementers of template<> pldexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
 
 /** \internal \returns log(x) for single precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
 
 /** \internal \returns log2(x) for single precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x);
 
 /** \internal \returns log(x) for double precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x);
 
 /** \internal \returns log2(x) for double precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x);
 
 /** \internal \returns log(1 + x) */
-template<typename Packet>
+template <typename Packet>
 Packet generic_plog1p(const Packet& x);
 
 /** \internal \returns exp(x)-1 */
-template<typename Packet>
+template <typename Packet>
 Packet generic_expm1(const Packet& x);
 
 /** \internal \returns exp(x) for single precision float */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
 
 /** \internal \returns exp(x) for double precision real numbers */
 template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x);
 
 /** \internal \returns sin(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x);
 
 /** \internal \returns cos(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
 
 /** \internal \returns asin(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
 
 /** \internal \returns acos(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x);
 
 /** \internal \returns atan(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x);
 
 /** \internal \returns atan(x) for double precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_double(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x);
 
 /** \internal \returns atanh(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patanh_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x);
 
 /** \internal \returns sqrt(x) for complex types */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt_complex(const Packet& a);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a);
 
 /** \internal \returns x / y for complex types */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdiv_complex(const Packet& x, const Packet& y);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y);
 
-template <typename Packet, int N> struct ppolevl;
+template <typename Packet, int N>
+struct ppolevl;
 
 // Macros for instantiating these generic functions for different backends.
 #define EIGEN_PACKET_FUNCTION(METHOD, SCALAR, PACKET)                                             \
@@ -166,7 +152,7 @@
   EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET)                \
   EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
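Editor's note: the dedicated plog1p/pexpm1 kernels forward-declared above exist because the naive formulations log(1 + x) and exp(x) - 1 suffer catastrophic cancellation for |x| << 1. A standalone scalar sketch of the motivation (plain C++, not Eigen's vectorized code):

#include <cmath>
#include <cstdio>

int main() {
  const double x = 1e-12;
  // Naive forms: rounding 1.0 + x (or exp(x) - 1.0) first destroys most of x.
  std::printf("log(1+x) = %.17g\n", std::log(1.0 + x));  // only ~4 correct digits
  std::printf("log1p(x) = %.17g\n", std::log1p(x));      // accurate to full precision
  std::printf("exp(x)-1 = %.17g\n", std::exp(x) - 1.0);  // same cancellation problem
  std::printf("expm1(x) = %.17g\n", std::expm1(x));      // accurate to full precision
}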
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index c652318..92516c7 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -24,7 +24,6 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
 // type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
@@ -32,7 +31,6 @@
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
 
-
 #ifndef EIGEN_HALF_H
 #define EIGEN_HALF_H
 
@@ -46,16 +44,15 @@
 // As a consequence, we get compile failures when compiling Eigen with
 // GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
 // Eigen with GPU support
-  #pragma push_macro("EIGEN_CONSTEXPR")
-  #undef EIGEN_CONSTEXPR
-  #define EIGEN_CONSTEXPR
+#pragma push_macro("EIGEN_CONSTEXPR")
+#undef EIGEN_CONSTEXPR
+#define EIGEN_CONSTEXPR
 #endif
 
-#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)           \
-  template <>                                                       \
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED                \
-  PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) {             \
-    return float2half(METHOD<PACKET_F>(half2float(_x)));            \
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)                                                  \
+  template <>                                                                                              \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+    return float2half(METHOD<PACKET_F>(half2float(_x)));                                                   \
   }
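Editor's note: the reflowed F16_PACKET_FUNCTION macro generates half-precision packet math by round-tripping through the corresponding float kernel. A hypothetical expansion of F16_PACKET_FUNCTION(Packet8f, Packet8h, plog) — this packet pairing is chosen purely for illustration — would read:

template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED Packet8h plog<Packet8h>(const Packet8h& _x) {
  // Widen the fp16 lanes to fp32, run the float kernel, narrow back to fp16.
  return float2half(plog<Packet8f>(half2float(_x)));
}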
 
 namespace Eigen {
@@ -97,8 +94,7 @@
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
 #endif
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {
-  }
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
   __fp16 x;
 #else
   explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
@@ -107,15 +103,15 @@
 };
 
 #elif defined(EIGEN_HAS_HIP_FP16)
-  // Nothing to do here
-  // HIP fp16 header file has a definition for __half_raw
+// Nothing to do here
+// HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if EIGEN_CUDA_SDK_VER < 90000
-    // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-    typedef __half __half_raw;
-  #endif // defined(EIGEN_HAS_CUDA_FP16)
+#if EIGEN_CUDA_SDK_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
+#endif  // defined(EIGEN_HAS_CUDA_FP16)
 #elif defined(SYCL_DEVICE_ONLY)
-  typedef cl::sycl::half __half_raw;
+typedef cl::sycl::half __half_raw;
 #endif
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
@@ -127,21 +123,20 @@
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
- #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if EIGEN_CUDA_SDK_VER >= 90000
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
-  #endif
- #endif
+#endif
+#endif
 #endif
 };
 
-} // namespace half_impl
+}  // namespace half_impl
 
 // Class definition.
 struct half : public half_impl::half_base {
-
   // Writing this out as separate #if-else blocks to make the code easier to follow.
   // The same applies to most #if-else blocks in this file.
 #if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
@@ -153,12 +148,12 @@
   // Nothing to do here
   // HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
-  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
-  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    typedef half_impl::__half_raw __half_raw;
-  #endif
+// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! Keeping this inside
+// #if defined(EIGEN_HAS_CUDA_FP16) is therefore necessary.
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+  typedef half_impl::__half_raw __half_raw;
+#endif
 #endif
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
@@ -166,31 +161,29 @@
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
 #if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
- #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-  #endif
- #endif
 #endif
-
+#endif
+#endif
 
   explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
-  template<class T>
+  template <class T>
   explicit EIGEN_DEVICE_FUNC half(T val)
       : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
-  explicit EIGEN_DEVICE_FUNC half(float f)
-      : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+  explicit EIGEN_DEVICE_FUNC half(float f) : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
 
   // Following the convention of numpy, converting between complex and
   // float will lead to loss of the imaginary part.
-  template<typename RealScalar>
+  template <typename RealScalar>
   explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
       : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
 
-   EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
     return half_impl::half_to_float(*this);
   }
 
@@ -224,8 +217,10 @@
   static EIGEN_CONSTEXPR const bool is_bounded = true;
   static EIGEN_CONSTEXPR const bool is_modulo = false;
   static EIGEN_CONSTEXPR const int digits = 11;
-  static EIGEN_CONSTEXPR const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
-  static EIGEN_CONSTEXPR const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static EIGEN_CONSTEXPR const int digits10 =
+      3;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static EIGEN_CONSTEXPR const int max_digits10 =
+      5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
   static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
   static EIGEN_CONSTEXPR const int min_exponent = -13;
   static EIGEN_CONSTEXPR const int min_exponent10 = -4;
@@ -236,9 +231,9 @@
   // detect tininess in the same way for all operations in radix two"
   static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
 
-  static EIGEN_CONSTEXPR Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+  static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
   static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
-  static EIGEN_CONSTEXPR Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
   static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
   static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
   static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
@@ -247,51 +242,51 @@
   static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
 };
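Editor's note: the raw constants above are IEEE 754 binary16 bit patterns (1 sign, 5 exponent, 10 mantissa bits, exponent bias 15). A minimal decoder for normal values — a sketch for checking the table, not Eigen's half_to_float — confirms a few of them:

#include <cmath>
#include <cstdint>
#include <cstdio>

double decode_fp16_normal(std::uint16_t h) {
  const int exponent = (h >> 10) & 0x1f;  // 5 exponent bits, bias 15
  const int mantissa = h & 0x3ff;         // 10 stored mantissa bits
  return std::ldexp(1.0 + mantissa / 1024.0, exponent - 15);
}

int main() {
  std::printf("epsilon 0x1400 = %g\n", decode_fp16_normal(0x1400));  // 2^-10, matching digits = 11
  std::printf("(max)   0x7bff = %g\n", decode_fp16_normal(0x7bff));  // 65504
  std::printf("(min)   0x0400 = %g\n", decode_fp16_normal(0x0400));  // 2^-14, smallest normal
}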
 
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps;
-template<typename T>
+template <typename T>
 EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before;
 }  // end namespace half_impl
 }  // end namespace Eigen
@@ -301,13 +296,13 @@
 // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
 // std::numeric_limits<const volatile T>
 // https://stackoverflow.com/a/16519653/
-template<>
+template <>
 class numeric_limits<Eigen::half> : public Eigen::half_impl::numeric_limits_half_impl<> {};
-template<>
+template <>
 class numeric_limits<const Eigen::half> : public numeric_limits<Eigen::half> {};
-template<>
+template <>
 class numeric_limits<volatile Eigen::half> : public numeric_limits<Eigen::half> {};
-template<>
+template <>
 class numeric_limits<const volatile Eigen::half> : public numeric_limits<Eigen::half> {};
 }  // end namespace std
 
@@ -315,8 +310,7 @@
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                  \
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type, since GPU halves are rather different from native CPU halves.
@@ -330,20 +324,16 @@
 // conversion steps back and forth.
 
 #if defined(EIGEN_HAS_NATIVE_FP16)
-EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
 #else
   return __hadd(a, b);
 #endif
 }
-EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
 #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hdiv(a, b);
 #else
@@ -352,99 +342,63 @@
   return __float2half(num / denom);
 #endif
 }
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
-  return __hneg(a);
-}
-EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
+EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
   a = a + b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
   a = a * b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
   a = a - b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
   a = a / b;
   return a;
 }
-EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
-  return __heq(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
-  return __hne(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
-  return __hlt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
-  return __hle(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
-  return __hgt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
-  return __hge(a, b);
-}
+EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { return __heq(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { return __hne(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { return __hlt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
 #endif
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
-  return half(vaddh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
-  return half(vmulh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
-  return half(vsubh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
-  return half(vdivh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
-  return half(vnegh_f16(a.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(vmulh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(vsubh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(vdivh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(vnegh_f16(a.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
   a = half(vaddh_f16(a.x, b.x));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
   a = half(vmulh_f16(a.x, b.x));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
   a = half(vsubh_f16(a.x, b.x));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
   a = half(vdivh_f16(a.x, b.x));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
-  return vceqh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
-  return !vceqh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
-  return vclth_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
-  return vcleh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
-  return vcgth_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
-  return vcgeh_f16(a.x, b.x);
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return vclth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on macOS). The former needs to see both host and device implementations
 // of the functions, while the latter can only deal with one of them.
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
+#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* FP16 operators for clang.
@@ -452,64 +406,48 @@
 #undef EIGEN_DEVICE_FUNC
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
 #define EIGEN_DEVICE_FUNC __host__
-#else // both host and device need emulated ops.
+#else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
 #endif
 #endif
 
 // Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
-  return half(float(a) + float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
-  return half(float(a) * float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
-  return half(float(a) - float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
-  return half(float(a) / float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(float(a) + float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(float(a) * float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(float(a) - float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(float(a) / float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
   half result;
   result.x = a.x ^ 0x8000;
   return result;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
   a = half(float(a) + float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
   a = half(float(a) * float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
   a = half(float(a) - float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
   a = half(float(a) / float(b));
   return a;
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
-  return numext::equal_strict(float(a),float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
+  return numext::equal_strict(float(a), float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
   return numext::not_equal_strict(float(a), float(b));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
-  return float(a) < float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
-  return float(a) <= float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
-  return float(a) > float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
-  return float(a) >= float(b);
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
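Editor's note: each emulated operator above widens to float, computes, and narrows again, so every individual operation is correctly rounded to half precision; addends far below half's epsilon (2^-10) are therefore absorbed. A comment sketch, assuming <Eigen/Core> is available:

//   Eigen::half one(1.0f);
//   Eigen::half tiny(1e-4f);              // well below half's epsilon, 2^-10
//   bool absorbed = (one + tiny) == one;  // true: 1.0001f rounds back to 1.0 in half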
 
 #if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 #pragma pop_macro("EIGEN_DEVICE_FUNC")
@@ -518,7 +456,7 @@
 
 // Division by an index. Do it in full float precision to avoid accuracy
 // issues in converting the denominator to half.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
   return half(static_cast<float>(a) / static_cast<float>(b));
 }
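Editor's note: concretely, Index values above half's largest finite value (65504) would convert to +inf, collapsing the quotient to zero. A sketch of the failure mode this overload avoids (assuming <Eigen/Core>):

#include <Eigen/Core>
#include <cstdio>

int main() {
  Eigen::half a(1.0f);
  Eigen::Index b = 100000;  // exceeds half's max finite value, 65504
  Eigen::half bad = a / Eigen::half(static_cast<float>(b));  // half(b) is +inf, so bad == 0
  Eigen::half good = a / b;  // the overload above divides in float: ~1e-5
  std::printf("bad = %g, good = %g\n", static_cast<float>(bad), static_cast<float>(good));
}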
 
@@ -557,8 +495,8 @@
   // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
   // of this catch-22 by having separate bodies for GPU / non-GPU.
 #if defined(EIGEN_HAS_GPU_FP16)
-   __half_raw h;
-   h.x = x;
+  __half_raw h;
+  h.x = x;
   return h;
 #else
   return __half_raw(x);
@@ -585,18 +523,18 @@
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
 #elif defined(EIGEN_HAS_FP16_C)
   __half_raw h;
-  #if EIGEN_COMP_MSVC
-    // MSVC does not have scalar instructions.
-    h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
-  #else
-    h.x = _cvtss_sh(ff, 0);
-  #endif
+#if EIGEN_COMP_MSVC
+  // MSVC does not have scalar instructions.
+  h.x = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
+#else
+  h.x = _cvtss_sh(ff, 0);
+#endif
   return h;
 
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
@@ -605,11 +543,12 @@
   return h;
 
 #else
-  float32_bits f; f.f = ff;
+  float32_bits f;
+  f.f = ff;
 
-  const float32_bits f32infty = { 255 << 23 };
-  const float32_bits f16max = { (127 + 16) << 23 };
-  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  const float32_bits f32infty = {255 << 23};
+  const float32_bits f16max = {(127 + 16) << 23};
+  const float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
   unsigned int sign_mask = 0x80000000u;
   __half_raw o;
   o.x = static_cast<numext::uint16_t>(0x0u);
@@ -622,10 +561,10 @@
   // 0x80000000. Important if you want fast straight SSE2 code
   // (since there's no unsigned PCMPGTD).
 
-  if (f.u >= f16max.u) {  // result is Inf or NaN (all exponent bits set)
-    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
-  } else {  // (De)normalized number or zero
-    if (f.u < (113 << 23)) {  // resulting FP16 is subnormal or zero
+  if (f.u >= f16max.u) {                         // result is Inf or NaN (all exponent bits set)
+    o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
+  } else {                                       // (De)normalized number or zero
+    if (f.u < (113 << 23)) {                     // resulting FP16 is subnormal or zero
       // use a magic value to align our 10 mantissa bits at the bottom of
       // the float. as long as FP addition is round-to-nearest-even this
       // just works.
@@ -634,7 +573,7 @@
       // and one integer subtract of the bias later, we have our final float!
       o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
     } else {
-      unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd
 
       // update exponent, rounding bias part 1
       // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
@@ -654,51 +593,51 @@
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __half2float(h);
 #elif defined(EIGEN_HAS_FP16_C)
-  #if EIGEN_COMP_MSVC
-    // MSVC does not have scalar instructions.
-    return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
-  #else
-    return _cvtsh_ss(h.x);
-  #endif
+#if EIGEN_COMP_MSVC
+  // MSVC does not have scalar instructions.
+  return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
+#else
+  return _cvtsh_ss(h.x);
+#endif
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return static_cast<float>(h.x);
 #else
-  const float32_bits magic = { 113 << 23 };
-  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+  const float32_bits magic = {113 << 23};
+  const unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
   float32_bits o;
 
-  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
-  unsigned int exp = shifted_exp & o.u;   // just the exponent
-  o.u += (127 - 15) << 23;                // exponent adjust
+  o.u = (h.x & 0x7fff) << 13;            // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;               // exponent adjust
 
   // handle exponent special cases
-  if (exp == shifted_exp) {     // Inf/NaN?
-    o.u += (128 - 16) << 23;    // extra exp adjust
-  } else if (exp == 0) {        // Zero/Denormal?
-    o.u += 1 << 23;             // extra exp adjust
-    o.f -= magic.f;             // renormalize
+  if (exp == shifted_exp) {   // Inf/NaN?
+    o.u += (128 - 16) << 23;  // extra exp adjust
+  } else if (exp == 0) {      // Zero/Denormal?
+    o.u += 1 << 23;           // extra exp adjust
+    o.f -= magic.f;           // renormalize
   }
 
-  o.u |= (h.x & 0x8000) << 16;    // sign bit
+  o.u |= (h.x & 0x8000) << 16;  // sign bit
   return o.f;
 #endif
 }
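Editor's note: a worked trace of the software path above for h.x = 0x3C00 (fp16 one) shows how the magic constants cooperate:

//   o.u  = (0x3C00 & 0x7fff) << 13  ->  0x07800000   // exponent/mantissa moved into fp32 position
//   exp  = shifted_exp & o.u        ->  0x07800000   // neither the Inf/NaN nor the zero/denormal branch
//   o.u += (127 - 15) << 23         ->  0x3F800000   // rebias: fp16 bias 15 to fp32 bias 127
//   sign = (0x3C00 & 0x8000) << 16  ->  0
// 0x3F800000 is exactly 1.0f. float_to_half_rtne above inverts the process,
// e.g. 1.5f (0x3FC00000) rebiases and shifts back down to 0x3E00.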
 
 // --- standard functions ---
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
 #ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
 #else
   return (a.x & 0x7fff) == 0x7c00;
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -706,8 +645,8 @@
   return (a.x & 0x7fff) > 0x7c00;
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
-  return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
+  return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
 }
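Editor's note: these classification helpers are pure bit tests. With the sign bit masked off, an all-ones exponent field (0x7c00) with zero mantissa is infinity, and anything larger means a nonzero mantissa, i.e. NaN:

//   0x7c00 -> exponent 11111, mantissa 0        : +inf      ((a.x & 0x7fff) == 0x7c00)
//   0xfc00 -> same after masking the sign bit   : -inf
//   0x7e00 -> exponent 11111, mantissa nonzero  : quiet NaN ((a.x & 0x7fff) > 0x7c00)
//   0x7bff -> 65504, the largest finite half    : finite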
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
@@ -721,39 +660,34 @@
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+    defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
-   return half(::expf(float(a)));
+  return half(::expf(float(a)));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
-  return half(numext::expm1(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
+     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return half(::hlog(a));
 #else
   return half(::logf(float(a)));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
-  return half(numext::log1p(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
-  return half(::log10f(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { return half(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
   return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+    defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
-    return half(::sqrtf(float(a)));
+  return half(::sqrtf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
@@ -762,33 +696,17 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan2(const half& a, const half& b) {
   return half(::atan2f(float(a), float(b)));
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
-  return half(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
-  return half(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
-  return half(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
-  return half(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
-  return half(::asinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
-  return half(::acosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) {
-  return half(::atanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) {
-  return half(::atanhf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { return half(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { return half(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { return half(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { return half(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+    defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
@@ -796,25 +714,21 @@
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+    defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
-  return half(::rintf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
-  return half(::roundf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { return half(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { return half(::roundf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
   return half(::fmodf(float(a), float(b)));
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hlt(b, a) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -822,9 +736,9 @@
   return f2 < f1 ? b : a;
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hlt(a, b) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -834,51 +748,43 @@
 }
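Editor's note: the parentheses in (min) and (max) — which clang-format tightens to half(min)(...) above — are not decorative. They suppress function-style macro expansion, so these declarations survive translation units where <windows.h> has defined min/max macros. The same idiom applies at call sites:

//   auto lo  = (std::min)(x, y);                      // parentheses defeat a min(a, b) macro
//   float hi = (std::numeric_limits<float>::max)();   // likewise for max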
 
 #ifndef EIGEN_NO_IO
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
   os << static_cast<float>(v);
   return os;
 }
 #endif
 
-} // end namespace half_impl
+}  // end namespace half_impl
 
 // import Eigen::half_impl::half into Eigen namespace
 // using half_impl::half;
 
 namespace internal {
 
-template<>
-struct random_default_impl<half, false, false>
-{
-  static inline half run(const half& x, const half& y)
-  {
-    return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+template <>
+struct random_default_impl<half, false, false> {
+  static inline half run(const half& x, const half& y) {
+    return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
   }
-  static inline half run()
-  {
-    return run(half(-1.f), half(1.f));
-  }
+  static inline half run() { return run(half(-1.f), half(1.f)); }
 };
 
-template<> struct is_arithmetic<half> { enum { value = true }; };
+template <>
+struct is_arithmetic<half> {
+  enum { value = true };
+};
 
-} // end namespace internal
+}  // end namespace internal
 
-template<> struct NumTraits<Eigen::half>
-    : GenericNumTraits<Eigen::half>
-{
-  enum {
-    IsSigned = true,
-    IsInteger = false,
-    IsComplex = false,
-    RequireInitialization = false
-  };
+template <>
+struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+  enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
     return half_impl::raw_uint16_to_half(0x0800);
   }
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
-    return half_impl::raw_uint16_to_half(0x211f); //  Eigen::half(1e-2f);
+    return half_impl::raw_uint16_to_half(0x211f);  //  Eigen::half(1e-2f);
   }
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
     return half_impl::raw_uint16_to_half(0x7bff);
@@ -894,10 +800,10 @@
   }
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
-  #pragma pop_macro("EIGEN_CONSTEXPR")
+#pragma pop_macro("EIGEN_CONSTEXPR")
 #endif
 
 namespace Eigen {
@@ -946,63 +852,65 @@
 //    with native support for __half and __nv_bfloat16
 //
 // Note that the following are __device__-only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
-    || defined(EIGEN_HIPCC)
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
 
 #if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
+                                                       int width = warpSize) {
   const __half h = var;
   return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                          int width = warpSize) {
   const __half h = var;
   return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                            int width = warpSize) {
   const __half h = var;
   return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
+                                                           int width = warpSize) {
   const __half h = var;
   return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }
 
-#else // HIP or CUDA SDK < 9.0
+#else  // HIP or CUDA SDK < 9.0
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
 }
 
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
   return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
 }
 
-#endif // HIP vs CUDA
-#endif // __shfl*
+#endif  // HIP vs CUDA
+#endif  // __shfl*
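Editor's note: a hypothetical device-side sketch (CUDA 9+ path, full-warp mask) of what these overloads enable — Eigen::half participating directly in a warp-level sum reduction; warp_sum is an illustrative name, not an Eigen API:

__device__ Eigen::half warp_sum(Eigen::half v) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Each lane accumulates the value held by the lane `offset` positions above it.
    v += __shfl_down_sync(0xffffffffu, v, offset);
  }
  return v;  // lane 0 ends up holding the warp-wide sum
}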
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
-    || defined(EIGEN_HIPCC)
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
-#endif // __ldg
+#endif  // __ldg
 
 #if EIGEN_HAS_STD_HASH
 namespace std {
@@ -1012,7 +920,7 @@
     return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
   }
 };
-} // end namespace std
+}  // end namespace std
 #endif
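Editor's note: because the specialization hashes the raw bit pattern, Eigen::half works as an unordered-container key out of the box (sketch; requires EIGEN_HAS_STD_HASH):

//   #include <unordered_map>
//   std::unordered_map<Eigen::half, int> counts;
//   counts[Eigen::half(1.5f)] += 1;

One caveat worth noting: +0.0 (0x0000) and -0.0 (0x8000) compare equal through operator== but hash to different values, so keys mixing signed zeros should be avoided.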
 
 namespace Eigen {
@@ -1020,8 +928,7 @@
 
 template <>
 struct cast_impl<float, half> {
-  EIGEN_DEVICE_FUNC
-  static inline half run(const float& a) {
+  EIGEN_DEVICE_FUNC static inline half run(const float& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
     return __float2half(a);
@@ -1033,8 +940,7 @@
 
 template <>
 struct cast_impl<int, half> {
-  EIGEN_DEVICE_FUNC
-  static inline half run(const int& a) {
+  EIGEN_DEVICE_FUNC static inline half run(const int& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
     return __float2half(static_cast<float>(a));
@@ -1046,8 +952,7 @@
 
 template <>
 struct cast_impl<half, float> {
-  EIGEN_DEVICE_FUNC
-  static inline float run(const half& a) {
+  EIGEN_DEVICE_FUNC static inline float run(const half& a) {
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
     (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
     return __half2float(a);
@@ -1060,4 +965,4 @@
 }  // namespace internal
 }  // namespace Eigen
 
-#endif // EIGEN_HALF_H
+#endif  // EIGEN_HALF_H
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index a5c3ada..7e3a970 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -8,7 +8,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 /* All the parameters defined in this file can be specialized in the
  * architecture specific files, and/or by the user.
  * More to come... */
@@ -17,33 +16,32 @@
 #define EIGEN_DEFAULT_SETTINGS_H
 
 /** Defines the maximal loop size to enable meta unrolling of loops.
-  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
-  * it does not correspond to the number of iterations or the number of instructions
-  */
+ * Note that the value here is expressed in Eigen's own notion of "number of FLOPS";
+ * it does not correspond to the number of iterations or the number of instructions.
+ */
 #ifndef EIGEN_UNROLLING_LIMIT
 #define EIGEN_UNROLLING_LIMIT 110
 #endif
 
 /** Defines the threshold between a "small" and a "large" matrix.
-  * This threshold is mainly used to select the proper product implementation.
-  */
+ * This threshold is mainly used to select the proper product implementation.
+ */
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
 /** Defines the maximal width of the blocks used in the triangular product and solver
-  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
-  */
+ * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+ */
 #ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
 #define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
 #endif
 
-
 /** Defines the default number of registers available for that architecture.
-  * Currently it must be 8 or 16. Other values will fail.
-  */
+ * Currently it must be 8 or 16. Other values will fail.
+ */
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
 #endif
 
-#endif // EIGEN_DEFAULT_SETTINGS_H
+#endif  // EIGEN_DEFAULT_SETTINGS_H
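Editor's note: since every setting above is guarded by #ifndef, each can be overridden per translation unit before the first Eigen include; the values below are illustrative, not recommendations:

#define EIGEN_UNROLLING_LIMIT 50                   // permit less meta-unrolling
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16   // treat more products as "small"
#include <Eigen/Core>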
diff --git a/Eigen/src/Core/arch/GPU/Complex.h b/Eigen/src/Core/arch/GPU/Complex.h
index 8a7869c..fa46aec 100644
--- a/Eigen/src/Core/arch/GPU/Complex.h
+++ b/Eigen/src/Core/arch/GPU/Complex.h
@@ -31,7 +31,7 @@
 //    to the first inclusion of <complex>.
 
 #if defined(EIGEN_GPUCC) && defined(EIGEN_GPU_COMPILE_PHASE)
-    
+
 // ICC already specializes std::complex<float> and std::complex<double>
 // operators, preventing us from making them device functions here.
 // This will lead to silent runtime errors if the operators are used on device.
@@ -62,33 +62,30 @@
 // Specialized std::complex overloads.
 namespace complex_operator_detail {
 
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_multiply(const std::complex<T>& a,
+                                                                       const std::complex<T>& b) {
   const T a_real = numext::real(a);
   const T a_imag = numext::imag(a);
   const T b_real = numext::real(b);
   const T b_imag = numext::imag(b);
-  return std::complex<T>(
-      a_real * b_real - a_imag * b_imag,
-      a_imag * b_real + a_real * b_imag);
+  return std::complex<T>(a_real * b_real - a_imag * b_imag, a_imag * b_real + a_real * b_imag);
 }
 
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_fast(const std::complex<T>& a,
+                                                                          const std::complex<T>& b) {
   const T a_real = numext::real(a);
   const T a_imag = numext::imag(a);
   const T b_real = numext::real(b);
   const T b_imag = numext::imag(b);
   const T norm = (b_real * b_real + b_imag * b_imag);
-  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
-                          (a_imag * b_real - a_real * b_imag) / norm);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm, (a_imag * b_real - a_real * b_imag) / norm);
 }
 
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_stable(const std::complex<T>& a,
+                                                                            const std::complex<T>& b) {
   const T a_real = numext::real(a);
   const T a_imag = numext::imag(a);
   const T b_real = numext::real(b);
@@ -99,13 +96,13 @@
   const T rscale = scale_imag ? T(1) : b_real / b_imag;
   const T iscale = scale_imag ? b_imag / b_real : T(1);
   const T denominator = b_real * rscale + b_imag * iscale;
-  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator, 
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
                          (a_imag * rscale - a_real * iscale) / denominator);
 }
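Editor's note: complex_divide_stable is essentially Smith's scaled division. By dividing through by the denominator's components first, it never forms b_real^2 + b_imag^2, which the fast path computes and which overflows float once |b| exceeds roughly sqrt(FLT_MAX) ~ 1.8e19. A host-side sketch of the difference:

#include <cstdio>

int main() {
  // Divide a = (1, 0) by b = (1e30, 1e30); the exact result is (5e-31, -5e-31).
  const float a_real = 1.0f, a_imag = 0.0f;
  const float b_real = 1e30f, b_imag = 1e30f;

  // Fast path: |b|^2 overflows float (limit ~3.4e38), so both parts collapse.
  const float norm = b_real * b_real + b_imag * b_imag;  // +inf
  std::printf("fast:   (%g, %g)\n", (a_real * b_real + a_imag * b_imag) / norm,
              (a_imag * b_real - a_real * b_imag) / norm);  // (0, -0)

  // Stable path: scale by the component ratio first; nothing large is squared.
  const float rscale = b_real / b_imag;  // components equal here, so 1
  const float iscale = 1.0f;
  const float denom = b_real * rscale + b_imag * iscale;  // 2e30, still finite
  std::printf("stable: (%g, %g)\n", (a_real * rscale + a_imag * iscale) / denom,
              (a_imag * rscale - a_real * iscale) / denom);  // (5e-31, -5e-31)
}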
 
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::complex<T>& a,
+                                                                     const std::complex<T>& b) {
 #if EIGEN_FAST_MATH
   return complex_divide_fast(a, b);
 #else
@@ -118,131 +115,107 @@
 //       since they are already specialized for float/double/long double within
 //       the standard <complex> header. We also do not specialize the stream
 //       operators.
-#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                    \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator+(const std::complex<T>& a) { return a; }                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator-(const std::complex<T>& a) {                                           \
-  return std::complex<T>(-numext::real(a), -numext::imag(a));                                   \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) {                 \
-  return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator+(const std::complex<T>& a, const T& b) {                               \
-  return std::complex<T>(numext::real(a) + b, numext::imag(a));                                 \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator+(const T& a, const std::complex<T>& b) {                               \
-  return std::complex<T>(a + numext::real(b), numext::imag(b));                                 \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) {                 \
-  return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator-(const std::complex<T>& a, const T& b) {                               \
-  return std::complex<T>(numext::real(a) - b, numext::imag(a));                                 \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator-(const T& a, const std::complex<T>& b) {                               \
-  return std::complex<T>(a - numext::real(b), -numext::imag(b));                                \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) {                 \
-  return complex_multiply(a, b);                                                                \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator*(const std::complex<T>& a, const T& b) {                               \
-  return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                             \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator*(const T& a, const std::complex<T>& b) {                               \
-  return std::complex<T>(a * numext::real(b), a * numext::imag(b));                             \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) {                 \
-  return complex_divide(a, b);                                                                  \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator/(const std::complex<T>& a, const T& b) {                               \
-  return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                             \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T> operator/(const T& a, const std::complex<T>& b) {                               \
-  return complex_divide(std::complex<T>(a, 0), b);                                              \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) {                     \
-  numext::real_ref(a) += numext::real(b);                                                       \
-  numext::imag_ref(a) += numext::imag(b);                                                       \
-  return a;                                                                                     \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) {                     \
-  numext::real_ref(a) -= numext::real(b);                                                       \
-  numext::imag_ref(a) -= numext::imag(b);                                                       \
-  return a;                                                                                     \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) {                     \
-  a = complex_multiply(a, b);                                                                   \
-  return a;                                                                                     \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) {                     \
-  a = complex_divide(a, b);                                                                     \
-  return  a;                                                                                    \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator==(const std::complex<T>& a, const std::complex<T>& b) {                           \
-  return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);              \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator==(const std::complex<T>& a, const T& b) {                                         \
-  return numext::real(a) == b && numext::imag(a) == 0;                                          \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator==(const T& a, const std::complex<T>& b) {                                         \
-  return a  == numext::real(b) && 0 == numext::imag(b);                                         \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {                           \
-  return !(a == b);                                                                             \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator!=(const std::complex<T>& a, const T& b) {                                         \
-  return !(a == b);                                                                             \
-}                                                                                               \
-                                                                                                \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                           \
-bool operator!=(const T& a, const std::complex<T>& b) {                                         \
-  return !(a == b);                                                                             \
-}
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                                        \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a) { return a; }           \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a) {                       \
+    return std::complex<T>(-numext::real(a), -numext::imag(a));                                                     \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b));                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) + b, numext::imag(a));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a + numext::real(b), numext::imag(b));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b));                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) - b, numext::imag(a));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a - numext::real(b), -numext::imag(b));                                                  \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return complex_multiply(a, b);                                                                                  \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a * numext::real(b), a * numext::imag(b));                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return complex_divide(a, b);                                                                                    \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const T& a, const std::complex<T>& b) {           \
+    return complex_divide(std::complex<T>(a, 0), b);                                                                \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
+    numext::real_ref(a) += numext::real(b);                                                                         \
+    numext::imag_ref(a) += numext::imag(b);                                                                         \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) { \
+    numext::real_ref(a) -= numext::real(b);                                                                         \
+    numext::imag_ref(a) -= numext::imag(b);                                                                         \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
+    a = complex_multiply(a, b);                                                                                     \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
+    a = complex_divide(a, b);                                                                                       \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const std::complex<T>& b) {       \
+    return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);                                \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const T& b) {                     \
+    return numext::real(a) == b && numext::imag(a) == 0;                                                            \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const T& a, const std::complex<T>& b) {                     \
+    return a == numext::real(b) && 0 == numext::imag(b);                                                            \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {       \
+    return !(a == b);                                                                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const T& b) { return !(a == b); } \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const T& a, const std::complex<T>& b) { return !(a == b); }
 
 // Do not specialize for long double, since that reduces to double on device.
 EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
@@ -250,7 +223,6 @@
 
 #undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
 
-  
 }  // namespace complex_operator_detail
 
 EIGEN_USING_STD_COMPLEX_OPERATORS
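For context, the macro above stamps out __device__-compatible overloads of the std::complex arithmetic and comparison operators (the standard-library ones are host-only), and EIGEN_USING_STD_COMPLEX_OPERATORS pulls them into scope. As a hedged illustration, the generated overloads are what let an expression like a * x[i] + y[i] compile in device code; the kernel below is hypothetical and not part of this change:

// Hypothetical kernel exercising the operators generated above.
__global__ void caxpy(int n, std::complex<float> a,
                      const std::complex<float>* x, std::complex<float>* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];  // operator* and operator+ from the macro
}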
diff --git a/Eigen/src/Core/arch/GPU/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
index f8191db..606215f 100644
--- a/Eigen/src/Core/arch/GPU/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -21,86 +21,73 @@
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog<float4>(const float4& a) {
   return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
 }
 
-template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog<double2>(const double2& a) {
   using ::log;
   return make_double2(log(a.x), log(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog1p<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog1p<float4>(const float4& a) {
   return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
 }
 
-template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog1p<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog1p<double2>(const double2& a) {
   return make_double2(log1p(a.x), log1p(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexp<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexp<float4>(const float4& a) {
   return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexp<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp<double2>(const double2& a) {
   using ::exp;
   return make_double2(exp(a.x), exp(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexpm1<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexpm1<float4>(const float4& a) {
   return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexpm1<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexpm1<double2>(const double2& a) {
   return make_double2(expm1(a.x), expm1(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 psqrt<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt<float4>(const float4& a) {
   return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 psqrt<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psqrt<double2>(const double2& a) {
   using ::sqrt;
   return make_double2(sqrt(a.x), sqrt(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 prsqrt<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 prsqrt<float4>(const float4& a) {
   return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 prsqrt<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 prsqrt<double2>(const double2& a) {
   return make_double2(rsqrt(a.x), rsqrt(a.y));
 }
 
-
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATH_FUNCTIONS_GPU_H
+#endif  // EIGEN_MATH_FUNCTIONS_GPU_H
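Every function in this file follows the same shape: the packet op applies the corresponding scalar CUDA math routine to each lane of float4/double2. A minimal sketch of that per-lane pattern in plain C++ (struct and function names here are illustrative, not Eigen API):

#include <cmath>

// Illustrative per-lane expansion, mirroring plog<float4> above.
struct float4_sketch { float x, y, z, w; };
inline float4_sketch plog_sketch(const float4_sketch& a) {
  return {std::log(a.x), std::log(a.y), std::log(a.z), std::log(a.w)};
}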
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 5c959ed..7900b0e 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -36,23 +36,29 @@
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 
-template<> struct is_arithmetic<float4>  { enum { value = true }; };
-template<> struct is_arithmetic<double2> { enum { value = true }; };
+template <>
+struct is_arithmetic<float4> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<double2> {
+  enum { value = true };
+};
 
-template<> struct packet_traits<float> : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef float4 type;
   typedef float4 half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
+    size = 4,
 
-    HasDiv  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasDiv = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasLGamma = 1,
@@ -74,18 +80,18 @@
   };
 };
 
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef double2 type;
   typedef double2 half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=2,
+    size = 2,
 
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasLGamma = 1,
@@ -107,14 +113,37 @@
   };
 };
 
+template <>
+struct unpacket_traits<float4> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef float4 half;
+};
+template <>
+struct unpacket_traits<double2> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef double2 half;
+};
 
-template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
   return make_float4(from, from, from, from);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
   return make_double2(from, from);
 }
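unpacket_traits advertises the lane count and alignment the vectorization layer queries, and pset1 broadcasts one scalar into every lane. A couple of illustrative compile-time checks against those traits (a sketch, assuming the Eigen::internal namespace is in scope):

// Sanity checks against the traits declared above (illustrative only).
static_assert(internal::unpacket_traits<float4>::size == 4, "float4 has four lanes");
static_assert(internal::unpacket_traits<double2>::size == 2, "double2 has two lanes");
// pset1<float4>(3.f) yields {3.f, 3.f, 3.f, 3.f}; pset1<double2>(2.) yields {2., 2.}.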
 
@@ -123,259 +152,254 @@
 // of the functions, while the latter can only deal with one of them.
 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
-                                                        const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
   return __int_as_float(__float_as_int(a) & __float_as_int(b));
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
-                                                         const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) &
-                              __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
-                                                       const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
   return __int_as_float(__float_as_int(a) | __float_as_int(b));
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
-                                                        const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) |
-                              __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
-                                                        const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
   return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
-                                                         const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) ^
-                              __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
-                                                           const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
   return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
-                                                            const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) &
-                              ~__double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
-                                                    const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
   return __int_as_float(a == b ? 0xffffffffu : 0u);
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
-                                                     const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
   return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
-                                                    const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
   return __int_as_float(a < b ? 0xffffffffu : 0u);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
-                                                     const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
   return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a,
-                                                    const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
   return __int_as_float(a <= b ? 0xffffffffu : 0u);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a,
-                                                     const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
   return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
 }
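The helpers above implement bitwise logic on floating-point lanes by round-tripping through integer bits with CUDA's __float_as_int/__longlong_as_double intrinsics, and the comparison masks are all-ones or all-zero bit patterns rather than booleans. A portable host-side sketch of the same bit-punning trick, with std::memcpy standing in for the intrinsics (names are illustrative):

#include <cstdint>
#include <cstring>

// Host-side analogue of bitwise_and(float, float) above.
inline float bitwise_and_sketch(float a, float b) {
  std::uint32_t ua, ub;
  std::memcpy(&ua, &a, sizeof(ua));  // reinterpret the float bits as an integer
  std::memcpy(&ub, &b, sizeof(ub));
  const std::uint32_t ur = ua & ub;
  float r;
  std::memcpy(&r, &ur, sizeof(r));   // and back to float
  return r;
}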
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
-                                                          const float4& b) {
-  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
-                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
-                                                            const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
   return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
-                                                         const float4& b) {
-  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
-                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
-                                                           const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
   return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
-                                                          const float4& b) {
-  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
-                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
-                                                            const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
   return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
-                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
+                     bitwise_andnot(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pandnot<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
   return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
-                     eq_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
-                     lt_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
+  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z),
-                     le_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
+  return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_eq<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
   return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_lt<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
   return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_le<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
   return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
 }
-#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#endif  // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG &&
+        // !EIGEN_COMP_NVCC)
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
-  return make_float4(a, a+1, a+2, a+3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a + 1, a + 2, a + 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
-  return make_double2(a, a+1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a + 1);
 }
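plset builds an affine ramp {a, a+1, ..., a+size-1}, which backs index- and linspaced-style expressions. The scalar equivalent (a sketch):

// Scalar analogue of plset<float4>(a) == {a, a + 1, a + 2, a + 3}.
inline void plset4_sketch(float a, float out[4]) {
  for (int i = 0; i < 4; ++i) out[i] = a + static_cast<float>(i);
}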
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x+b.x, a.y+b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x + b.x, a.y + b.y);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x-b.x, a.y-b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x - b.x, a.y - b.y);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
   return make_float4(-a.x, -a.y, -a.z, -a.w);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
   return make_double2(-a.x, -a.y);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
+  return a;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x*b.x, a.y*b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
+  return a;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x/b.x, a.y/b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x * b.x, a.y * b.y);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x / b.x, a.y / b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
   return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
   return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
   return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
   return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
   return *reinterpret_cast<const float4*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
   return *reinterpret_cast<const double2*>(from);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
   return make_float4(from[0], from[1], from[2], from[3]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
   return make_double2(from[0], from[1]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
   return make_float4(from[0], from[0], from[1], from[1]);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
   return make_double2(from[0], from[0]);
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
   *reinterpret_cast<float4*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
   *reinterpret_cast<double2*>(to) = from;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
   to[0] = from.x;
   to[1] = from.y;
   to[2] = from.z;
   to[3] = from.w;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
   to[0] = from.x;
   to[1] = from.y;
 }
 
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
   return __ldg(reinterpret_cast<const float4*>(from));
@@ -383,7 +407,7 @@
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
 }
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
   return __ldg(reinterpret_cast<const double2*>(from));
@@ -392,93 +416,110 @@
 #endif
 }
 
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
 }
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return make_double2(__ldg(from+0), __ldg(from+1));
+  return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
   return make_double2(from[0], from[1]);
 #endif
 }
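ploadt_ro prefers __ldg so reads are served through the GPU's read-only (texture) data cache, falling back to ordinary loads when EIGEN_GPU_HAS_LDG is not defined. The guard pattern in isolation (a sketch of the pattern used above, not new API):

// Read-only-cache load with fallback, the pattern used by ploadt_ro above.
__device__ inline float load_ro_sketch(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
  return __ldg(from);  // routed through the read-only data cache
#else
  return *from;        // plain global-memory load
#endif
}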
 
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
-  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
-  return make_double2(from[0*stride], from[1*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0 * stride], from[1 * stride]);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
-  to[stride*2] = from.z;
-  to[stride*3] = from.w;
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
+  to[stride * 2] = from.z;
+  to[stride * 3] = from.w;
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
 }
 
-template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
   return a.x;
 }
-template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
   return a.x;
 }
 
-template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
   return a.x + a.y + a.z + a.w;
 }
-template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
   return a.x + a.y;
 }
 
-template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
   return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
 }
-template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
   return fmax(a.x, a.y);
 }
 
-template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
   return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
 }
-template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
   return fmin(a.x, a.y);
 }
 
-template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
   return a.x * a.y * a.z * a.w;
 }
-template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
   return a.x * a.y;
 }
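The predux family performs horizontal reductions, collapsing a packet to one scalar via sum, min, max, or product over its lanes. Scalar sketch of the sum:

// Scalar analogue of predux<float4>: horizontal sum over the four lanes.
inline float predux4_sketch(const float v[4]) {
  return v[0] + v[1] + v[2] + v[3];
}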
 
-template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
 }
-template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
   return make_double2(fabs(a.x), fabs(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
   return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
 }
-template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
   return make_double2(floor(a.x), floor(a.y));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<float4,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
   float tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
   kernel.packet[1].x = tmp;
@@ -504,14 +545,13 @@
   kernel.packet[3].z = tmp;
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<double2,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
   double tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
   kernel.packet[1].x = tmp;
 }
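For the 2x2 double2 block, transposition reduces to swapping the two off-diagonal elements, which is exactly the three-statement exchange above. The same exchange on a plain struct (illustrative):

// 2x2 in-register transpose: swap element (0,1) with element (1,0).
struct double2_sketch { double x, y; };
inline void ptranspose2x2_sketch(double2_sketch& r0, double2_sketch& r1) {
  double tmp = r0.y;
  r0.y = r1.x;
  r1.x = tmp;
}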
 
-#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+#endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 
 // Half-packet functions are not available on the host for CUDA 9.0-9.2, only
 // on device. There is no benefit to using them on the host anyways, since they are
@@ -519,41 +559,68 @@
 #if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
 
 typedef ulonglong2 Packet4h2;
-template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
-template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
+template <>
+struct unpacket_traits<Packet4h2> {
+  typedef Eigen::half type;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4h2 half;
+};
+template <>
+struct is_arithmetic<Packet4h2> {
+  enum { value = true };
+};
 
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
-template<> struct is_arithmetic<half2> { enum { value = true }; };
+template <>
+struct unpacket_traits<half2> {
+  typedef Eigen::half type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef half2 half;
+};
+template <>
+struct is_arithmetic<half2> {
+  enum { value = true };
+};
 
-template<> struct packet_traits<Eigen::half> : default_packet_traits
-{
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
   typedef Packet4h2 type;
   typedef Packet4h2 half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=8,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasSqrt   = 1,
-    HasRsqrt  = 1,
-    HasExp    = 1,
-    HasExpm1  = 1,
-    HasLog    = 1,
-    HasLog1p  = 1
+    size = 8,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasLog = 1,
+    HasLog1p = 1
   };
 };
 
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
   return __half2half2(from);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pset1<Packet4h2>(const Eigen::half& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   p_alias[0] = pset1<half2>(from);
@@ -569,59 +636,48 @@
   return *reinterpret_cast<const half2*>(from);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
-  return __halves2half2(from[0], from[1]);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
   return __halves2half2(from[0], from[0]);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
-                                                  const half2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
   *reinterpret_cast<half2*>(to) = from;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
-                                                   const half2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
   to[0] = __low2half(from);
   to[1] = __high2half(from);
 }
 
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
-    const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
   // Input is guaranteed to be properly aligned.
   return __ldg(reinterpret_cast<const half2*>(from));
 #else
-  return __halves2half2(*(from+0), *(from+1));
+  return __halves2half2(*(from + 0), *(from + 1));
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
-    const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __halves2half2(__ldg(from+0), __ldg(from+1));
+  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
 #else
-  return __halves2half2(*(from+0), *(from+1));
+  return __halves2half2(*(from + 0), *(from + 1));
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
-                                                    Index stride) {
-  return __halves2half2(from[0*stride], from[1*stride]);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
+  return __halves2half2(from[0 * stride], from[1 * stride]);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
-    Eigen::half* to, const half2& from, Index stride) {
-  to[stride*0] = __low2half(from);
-  to[stride*1] = __high2half(from);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
+  to[stride * 0] = __low2half(from);
+  to[stride * 1] = __high2half(from);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
-  return __low2half(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
   half a1 = __low2half(a);
@@ -641,8 +697,7 @@
   return pset1<half2>(false_half);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<half2,2>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
   __half a1 = __low2half(kernel.packet[0]);
   __half a2 = __high2half(kernel.packet[0]);
   __half b1 = __low2half(kernel.packet[1]);
@@ -660,9 +715,7 @@
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
-                                                    const half2& a,
-                                                    const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
   half mask_low = __low2half(mask);
   half mask_high = __high2half(mask);
   half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
@@ -670,8 +723,7 @@
   return __halves2half2(result_low, result_high);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
-                                                    const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
   half true_half = half_impl::raw_uint16_to_half(0xffffu);
   half false_half = half_impl::raw_uint16_to_half(0x0000u);
   half a1 = __low2half(a);
@@ -683,8 +735,7 @@
   return __halves2half2(eq1, eq2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
-                                                    const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
   half true_half = half_impl::raw_uint16_to_half(0xffffu);
   half false_half = half_impl::raw_uint16_to_half(0x0000u);
   half a1 = __low2half(a);
@@ -696,8 +747,7 @@
   return __halves2half2(eq1, eq2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a,
-                                                    const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
   half true_half = half_impl::raw_uint16_to_half(0xffffu);
   half false_half = half_impl::raw_uint16_to_half(0x0000u);
   half a1 = __low2half(a);
@@ -709,8 +759,7 @@
   return __halves2half2(eq1, eq2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
   half a1 = __low2half(a);
   half a2 = __high2half(a);
   half b1 = __low2half(b);
@@ -720,8 +769,7 @@
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
-                                                const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
   half a1 = __low2half(a);
   half a2 = __high2half(a);
   half b1 = __low2half(b);
@@ -731,8 +779,7 @@
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
   half a1 = __low2half(a);
   half a2 = __high2half(a);
   half b1 = __low2half(b);
@@ -742,8 +789,7 @@
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
-                                                    const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
   half a1 = __low2half(a);
   half a2 = __high2half(a);
   half b1 = __low2half(b);
@@ -753,8 +799,7 @@
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd2(a, b);
 #else
@@ -768,8 +813,7 @@
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hsub2(a, b);
 #else
@@ -795,8 +839,7 @@
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul2(a, b);
 #else
@@ -810,11 +853,9 @@
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
-                                                  const half2& b,
-                                                  const half2& c) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-   return __hfma2(a, b, c);
+  return __hfma2(a, b, c);
 #else
   float a1 = __low2float(a);
   float a2 = __high2float(a);
@@ -828,8 +869,7 @@
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __h2div(a, b);
 #else
@@ -843,8 +883,7 @@
 #endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -854,8 +893,7 @@
   return __halves2half2(r1, r2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
-                                                 const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -925,28 +963,15 @@
   return __floats2half2_rn(r1, r2);
 }
 
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 plog(const half2& a) {
-  return h2log(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 pexp(const half2& a) {
-  return h2exp(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 psqrt(const half2& a) {
-  return h2sqrt(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 prsqrt(const half2& a) {
-  return h2rsqrt(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
 
 #else
 
@@ -982,18 +1007,16 @@
   return __floats2half2_rn(r1, r2);
 }
 #endif
-} // namespace
+}  // namespace
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pload<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
   return *reinterpret_cast<const Packet4h2*>(from);
 }
 
 // Unaligned load.
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploadu<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   p_alias[0] = ploadu(from + 0);
@@ -1004,8 +1027,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploaddup<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   p_alias[0] = ploaddup(from + 0);
@@ -1016,24 +1038,21 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
-    Eigen::half* to, const Packet4h2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
   *reinterpret_cast<Packet4h2*>(to) = from;
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
-    Eigen::half* to, const Packet4h2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
   const half2* from_alias = reinterpret_cast<const half2*>(&from);
-  pstoreu(to + 0,from_alias[0]);
-  pstoreu(to + 2,from_alias[1]);
-  pstoreu(to + 4,from_alias[2]);
-  pstoreu(to + 6,from_alias[3]);
+  pstoreu(to + 0, from_alias[0]);
+  pstoreu(to + 2, from_alias[1]);
+  pstoreu(to + 4, from_alias[2]);
+  pstoreu(to + 6, from_alias[3]);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
   Packet4h2 r;
   r = __ldg(reinterpret_cast<const Packet4h2*>(from));
@@ -1050,8 +1069,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   r_alias[0] = ploadt_ro_unaligned(from + 0);
@@ -1062,8 +1080,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
@@ -1074,8 +1091,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
-    Eigen::half* to, const Packet4h2& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
+                                                                            Index stride) {
   const half2* from_alias = reinterpret_cast<const half2*>(&from);
   pscatter(to + stride * 0, from_alias[0], stride);
   pscatter(to + stride * 2, from_alias[1], stride);
@@ -1084,14 +1101,12 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
   return pfirst(*(reinterpret_cast<const half2*>(&a)));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1103,8 +1118,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
-    const Packet4h2& /*a*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
   half true_half = half_impl::raw_uint16_to_half(0xffffu);
   return pset1<Packet4h2>(true_half);
 }
@@ -1115,9 +1129,9 @@
   return pset1<Packet4h2>(false_half);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
-    double* d_row0, double* d_row1, double* d_row2, double* d_row3,
-    double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
+                                                             double* d_row3, double* d_row4, double* d_row5,
+                                                             double* d_row6, double* d_row7) {
   double d_tmp;
   d_tmp = d_row0[1];
   d_row0[1] = d_row4[0];
@@ -1136,8 +1150,8 @@
   d_row7[0] = d_tmp;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
-    half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
+                                                            half2* f_row3) {
   half2 f_tmp;
   f_tmp = f_row0[1];
   f_row0[1] = f_row2[0];
@@ -1148,8 +1162,7 @@
   f_row3[0] = f_tmp;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose_half(half2& f0, half2& f1) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
   __half a1 = __low2half(f0);
   __half a2 = __high2half(f0);
   __half b1 = __low2half(f1);
@@ -1158,8 +1171,7 @@
   f1 = __halves2half2(a2, b2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h2,8>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
   double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
   double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
   double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
@@ -1168,9 +1180,7 @@
   double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
   double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
   double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
-  ptranspose_double(d_row0, d_row1, d_row2, d_row3,
-                    d_row4, d_row5, d_row6, d_row7);
-
+  ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
 
   half2* f_row0 = reinterpret_cast<half2*>(d_row0);
   half2* f_row1 = reinterpret_cast<half2*>(d_row1);
@@ -1211,23 +1221,18 @@
   ptranspose_half(f_row0[1], f_row1[1]);
   ptranspose_half(f_row2[0], f_row3[0]);
   ptranspose_half(f_row2[1], f_row3[1]);
-
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plset<Packet4h2>(const Eigen::half& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 
   Packet4h2 r;
   half2* p_alias = reinterpret_cast<half2*>(&r);
   p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
-  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
-                              __hadd(a, __float2half(3.0f)));
-  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
-                              __hadd(a, __float2half(5.0f)));
-  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
-                              __hadd(a, __float2half(7.0f)));
+  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
+  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
+  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
   return r;
 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   Packet4h2 r;
@@ -1235,8 +1240,8 @@
 
   half2 b = pset1<half2>(a);
   half2 c;
-  half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
-  half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
+  half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
+  half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
 
   c = __hadd2(b, half_offset0);
   r_alias[0] = plset(__low2half(c));
@@ -1261,9 +1266,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
-                   const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+                                                                   const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
@@ -1277,8 +1281,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1291,8 +1294,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1305,8 +1307,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1319,8 +1320,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1333,8 +1333,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1347,8 +1346,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1361,8 +1359,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1375,8 +1372,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1389,8 +1385,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1420,8 +1415,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1434,8 +1428,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
+                                                                 const Packet4h2& c) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1449,8 +1443,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1463,8 +1456,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1477,8 +1469,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1491,64 +1482,53 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
 
-  return predux(a_alias[0]) + predux(a_alias[1]) +
-         predux(a_alias[2]) + predux(a_alias[3]);
+  return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = __halves2half2(predux_max(a_alias[0]),
-                            predux_max(a_alias[1]));
-  half2 m1 = __halves2half2(predux_max(a_alias[2]),
-                            predux_max(a_alias[3]));
-  __half first  = predux_max(m0);
+  half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
+  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
+  __half first = predux_max(m0);
   __half second = predux_max(m1);
 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   return (__hgt(first, second) ? first : second);
 #else
-  float ffirst  = __half2float(first);
+  float ffirst = __half2float(first);
   float fsecond = __half2float(second);
-  return (ffirst > fsecond)? first: second;
+  return (ffirst > fsecond) ? first : second;
 #endif
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = __halves2half2(predux_min(a_alias[0]),
-                            predux_min(a_alias[1]));
-  half2 m1 = __halves2half2(predux_min(a_alias[2]),
-                            predux_min(a_alias[3]));
-  __half first  = predux_min(m0);
+  half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
+  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
+  __half first = predux_min(m0);
   __half second = predux_min(m1);
 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   return (__hlt(first, second) ? first : second);
 #else
-  float ffirst  = __half2float(first);
+  float ffirst = __half2float(first);
   float fsecond = __half2float(second);
-  return (ffirst < fsecond)? first: second;
+  return (ffirst < fsecond) ? first : second;
 #endif
 }
 
 // Likely to overflow/underflow: an eight-term product easily leaves half's representable range (max 65504).
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
-    const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
-                                       pmul(a_alias[2], a_alias[3])));
+  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plog1p<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1560,8 +1540,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pexpm1<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1609,8 +1588,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-prsqrt<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1623,9 +1601,8 @@
 
 // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
 // the implementation of GPU half reduction.
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
-                                                        const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd2(a, b);
 #else
@@ -1639,9 +1616,8 @@
 #endif
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
-                                                        const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul2(a, b);
 #else
@@ -1655,9 +1631,8 @@
 #endif
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
-                                                        const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __h2div(a, b);
 #else
@@ -1671,9 +1646,8 @@
 #endif
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
-                                                        const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -1683,9 +1657,8 @@
   return __halves2half2(r1, r2);
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
-                                                        const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   float a1 = __low2float(a);
   float a2 = __high2float(a);
   float b1 = __low2float(b);
@@ -1695,15 +1668,14 @@
   return __halves2half2(r1, r2);
 }
 
-#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
 
 #undef EIGEN_GPU_HAS_LDG
 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-
-#endif // EIGEN_PACKET_MATH_GPU_H
+#endif  // EIGEN_PACKET_MATH_GPU_H
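
Note on the fp16 fallback paths above: whenever EIGEN_GPU_HAS_FP16_ARITHMETIC is
unavailable, every half2 operation follows the same shape — widen both lanes to
float, operate, then narrow back with __floats2half2_rn. Below is a minimal
host-side sketch of that pattern; packet2 and its float lanes are illustrative
stand-ins for half2 and the __low2float/__high2float/__floats2half2_rn
intrinsics, not CUDA APIs.

    #include <cstdio>

    // Illustrative stand-in for half2: two lanes, kept as float so the sketch
    // compiles on the host (on device the lanes would be __half).
    struct packet2 {
      float lo, hi;
    };

    // Mirrors the padd fallback above: widen each lane, add in float, narrow back.
    static packet2 padd_fallback(const packet2& a, const packet2& b) {
      float a1 = a.lo, a2 = a.hi;  // __low2float(a), __high2float(a)
      float b1 = b.lo, b2 = b.hi;  // __low2float(b), __high2float(b)
      float r1 = a1 + b1;
      float r2 = a2 + b2;
      return packet2{r1, r2};  // __floats2half2_rn(r1, r2) on device
    }

    int main() {
      packet2 r = padd_fallback(packet2{1.0f, 2.0f}, packet2{3.0f, 4.0f});
      std::printf("%g %g\n", r.lo, r.hi);  // prints: 4 6
      return 0;
    }
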
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index e223ca1..6bea9ac 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -20,196 +20,173 @@
 namespace tuple_impl {
 
 // Internal tuple implementation.
-template<size_t N, typename... Types>
+template <size_t N, typename... Types>
 class TupleImpl;
 
 // Generic recursive tuple.
-template<size_t N, typename T1, typename... Ts>
+template <size_t N, typename T1, typename... Ts>
 class TupleImpl<N, T1, Ts...> {
  public:
   // Tuple may contain Eigen types.
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-  
+
   // Default constructor, enable if all types are default-constructible.
-  template<typename U1 = T1, typename EnableIf = std::enable_if_t<
-      std::is_default_constructible<U1>::value
-      && reduce_all<std::is_default_constructible<Ts>::value...>::value
-    >>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC
-  TupleImpl() : head_{}, tail_{} {}
- 
+  template <typename U1 = T1,
+            typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
+                                                 reduce_all<std::is_default_constructible<Ts>::value...>::value>>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+
   // Element constructor.
-  template<typename U1, typename... Us, 
-           // Only enable if...
-           typename EnableIf = std::enable_if_t<
-              // the number of input arguments match, and ...
-              sizeof...(Us) == sizeof...(Ts) && (
-                // this does not look like a copy/move constructor.
-                N > 1 || std::is_convertible<U1, T1>::value)
-           >>
-  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC
-  TupleImpl(U1&& arg1, Us&&... args) 
-    : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
- 
-  // The first stored value. 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  T1& head() {
-    return head_;
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  const T1& head() const {
-    return head_;
-  }
-  
+  template <typename U1, typename... Us,
+            // Only enable if...
+            typename EnableIf = std::enable_if_t<
+                // the number of input arguments match, and ...
+                sizeof...(Us) == sizeof...(Ts) && (
+                                                      // this does not look like a copy/move constructor.
+                                                      N > 1 || std::is_convertible<U1, T1>::value)>>
+  EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+      : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
+
+  // The first stored value.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& head() { return head_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& head() const { return head_; }
+
   // The tail values.
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  TupleImpl<N-1, Ts...>& tail() {
-    return tail_;
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  const TupleImpl<N-1, Ts...>& tail() const {
-    return tail_;
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void swap(TupleImpl& other) {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TupleImpl<N - 1, Ts...>& tail() { return tail_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const TupleImpl<N - 1, Ts...>& tail() const { return tail_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(TupleImpl& other) {
     using numext::swap;
     swap(head_, other.head_);
     swap(tail_, other.tail_);
   }
-  
-  template<typename... UTypes>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
+
+  template <typename... UTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
     head_ = other.head_;
     tail_ = other.tail_;
     return *this;
   }
-  
-  template<typename... UTypes>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
+
+  template <typename... UTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
     head_ = std::move(other.head_);
     tail_ = std::move(other.tail_);
     return *this;
   }
-  
+
  private:
   // Allow related tuples to reference head_/tail_.
-  template<size_t M, typename... UTypes>
+  template <size_t M, typename... UTypes>
   friend class TupleImpl;
- 
+
   T1 head_;
-  TupleImpl<N-1, Ts...> tail_;
+  TupleImpl<N - 1, Ts...> tail_;
 };
 
 // Empty tuple specialization.
-template<>
+template <>
 class TupleImpl<size_t(0)> {};
 
-template<typename TupleType>
+template <typename TupleType>
 struct is_tuple : std::false_type {};
 
-template<typename... Types>
-struct is_tuple< TupleImpl<sizeof...(Types), Types...> > : std::true_type {};
+template <typename... Types>
+struct is_tuple<TupleImpl<sizeof...(Types), Types...>> : std::true_type {};
 
 // Gets an element from a tuple.
-template<size_t Idx, typename T1, typename... Ts>
+template <size_t Idx, typename T1, typename... Ts>
 struct tuple_get_impl {
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
-  
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  ReturnType& run(TupleType& tuple) {
-    return tuple_get_impl<Idx-1, Ts...>::run(tuple.tail());
+
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+    return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  const ReturnType& run(const TupleType& tuple) {
-    return tuple_get_impl<Idx-1, Ts...>::run(tuple.tail());
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+    return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
   }
 };
 
 // Base case, getting the head element.
-template<typename T1, typename... Ts>
+template <typename T1, typename... Ts>
 struct tuple_get_impl<0, T1, Ts...> {
   using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
   using ReturnType = T1;
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  T1& run(TupleType& tuple) {
-    return tuple.head();
-  }
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
 
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  const T1& run(const TupleType& tuple) {
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) {
     return tuple.head();
   }
 };
 
 // Concatenates N Tuples.
-template<size_t NTuples, typename... Tuples>
+template <size_t NTuples, typename... Tuples>
 struct tuple_cat_impl;
 
-template<size_t NTuples, size_t N1, typename... Args1, size_t N2, typename... Args2, typename... Tuples>
+template <size_t NTuples, size_t N1, typename... Args1, size_t N2, typename... Args2, typename... Tuples>
 struct tuple_cat_impl<NTuples, TupleImpl<N1, Args1...>, TupleImpl<N2, Args2...>, Tuples...> {
   using TupleType1 = TupleImpl<N1, Args1...>;
   using TupleType2 = TupleImpl<N2, Args2...>;
   using MergedTupleType = TupleImpl<N1 + N2, Args1..., Args2...>;
-  
-  using ReturnType = typename tuple_cat_impl<NTuples-1, MergedTupleType, Tuples...>::ReturnType;
-  
+
+  using ReturnType = typename tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::ReturnType;
+
   // Uses the index sequences to extract and merge elements from tuple1 and tuple2,
   // then recurses with the merged tuple and any remaining tuples.
-  template<typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
-                 Tuple2&& tuple2, std::index_sequence<I2s...>,
-                 MoreTuples&&... tuples) {
-    return tuple_cat_impl<NTuples-1, MergedTupleType, Tuples...>::run(
+  template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1,
+                                                                              std::index_sequence<I1s...>,
+                                                                              Tuple2&& tuple2,
+                                                                              std::index_sequence<I2s...>,
+                                                                              MoreTuples&&... tuples) {
+    return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
         MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
                         tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
         std::forward<MoreTuples>(tuples)...);
   }
-  
+
   // Concatenates the first two tuples.
-  template<typename Tuple1, typename Tuple2, typename... MoreTuples>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2, MoreTuples&&... tuples) {
-    return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{},
-               std::forward<Tuple2>(tuple2), std::make_index_sequence<N2>{},
-               std::forward<MoreTuples>(tuples)...);
+  template <typename Tuple1, typename Tuple2, typename... MoreTuples>
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+                                                                              MoreTuples&&... tuples) {
+    return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
+               std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
   }
 };
 
 // Base case with a single tuple.
-template<size_t N, typename... Args>
-struct tuple_cat_impl<1, TupleImpl<N, Args...> > { 
+template <size_t N, typename... Args>
+struct tuple_cat_impl<1, TupleImpl<N, Args...>> {
   using ReturnType = TupleImpl<N, Args...>;
-  
-  template<typename Tuple1>
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ReturnType run(Tuple1&& tuple1) {
+
+  template <typename Tuple1>
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
     return tuple1;
   }
 };
 
 // Special case of no tuples.
-template<>
-struct tuple_cat_impl<0> { 
+template <>
+struct tuple_cat_impl<0> {
   using ReturnType = TupleImpl<0>;
-  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ReturnType run() {return ReturnType{}; }
+  static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
 };
 
 // For use in make_tuple, unwraps a reference_wrapper.
 template <typename T>
-struct unwrap_reference_wrapper { using type = T; };
- 
+struct unwrap_reference_wrapper {
+  using type = T;
+};
+
 template <typename T>
-struct unwrap_reference_wrapper<std::reference_wrapper<T> > { using type = T&; };
+struct unwrap_reference_wrapper<std::reference_wrapper<T>> {
+  using type = T&;
+};
 
 // For use in make_tuple, decays a type and unwraps a reference_wrapper.
 template <typename T>
@@ -220,11 +197,11 @@
 /**
  * Utility for determining a tuple's size.
  */
-template<typename Tuple>
+template <typename Tuple>
 struct tuple_size;
 
-template<typename... Types >
-struct tuple_size< TupleImpl<sizeof...(Types), Types...> > : std::integral_constant<size_t, sizeof...(Types)> {};
+template <typename... Types>
+struct tuple_size<TupleImpl<sizeof...(Types), Types...>> : std::integral_constant<size_t, sizeof...(Types)> {};
 
 /**
  * Gets an element of a tuple.
@@ -233,17 +210,15 @@
  * \param tuple the tuple.
  * \return a reference to the desired element.
  */
-template<size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename tuple_get_impl<Idx, Types...>::ReturnType&
-get(const TupleImpl<sizeof...(Types), Types...>& tuple) {
+template <size_t Idx, typename... Types>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+    const TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
 
-template<size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename tuple_get_impl<Idx, Types...>::ReturnType&
-get(TupleImpl<sizeof...(Types), Types...>& tuple) {
+template <size_t Idx, typename... Types>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+    TupleImpl<sizeof...(Types), Types...>& tuple) {
   return tuple_get_impl<Idx, Types...>::run(tuple);
 }
 
@@ -252,31 +227,27 @@
  * \param tuples ... list of tuples.
  * \return concatenated tuple.
  */
-template<typename... Tuples,
-          typename EnableIf = std::enable_if_t<
-            internal::reduce_all<
-              is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
+template <typename... Tuples, typename EnableIf = std::enable_if_t<
+                                  internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
 EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
-tuple_cat(Tuples&&... tuples) {
+    typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
+    tuple_cat(Tuples&&... tuples) {
   return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
 }
 
 /**
  * Tie arguments together into a tuple.
  */
-template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...> >
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
-    return ReturnType{args...};
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
+  return ReturnType{args...};
 }
 
 /**
  * Create a tuple of values from the supplied arguments (decayed, with any reference_wrapper unwrapped).
  */
-template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...> >
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ReturnType make_tuple(Args&&... args) {
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
   return ReturnType{std::forward<Args>(args)...};
 }
 
@@ -284,15 +255,15 @@
  * Forward a set of arguments as a tuple.
  */
 template <typename... Args>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(
+    Args&&... args) {
   return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
 }
 
 /**
  * Alternative to std::tuple that can be used on device.
  */
-template<typename... Types>
+template <typename... Types>
 using tuple = TupleImpl<sizeof...(Types), Types...>;
 
 }  // namespace tuple_impl
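
The TupleImpl layout above is the classic recursive head/tail encoding, with
tuple_get_impl peeling one tail per index step until its Idx == 0 base case.
Below is a self-contained miniature of that technique — an illustrative
re-derivation (using C++17 if constexpr for brevity), not Eigen's actual
classes.

    #include <cstddef>
    #include <cstdio>

    template <typename... Ts>
    struct Tup;

    template <>
    struct Tup<> {};  // empty-tuple base case, like TupleImpl<0>

    template <typename T1, typename... Ts>
    struct Tup<T1, Ts...> {
      T1 head;          // the first stored value
      Tup<Ts...> tail;  // a recursively smaller tuple
    };

    // get<I> recurses into tail I times, then returns the head — the same
    // structure as tuple_get_impl and its Idx == 0 specialization.
    template <std::size_t I, typename T1, typename... Ts>
    auto& get(Tup<T1, Ts...>& t) {
      if constexpr (I == 0)
        return t.head;
      else
        return get<I - 1>(t.tail);
    }

    int main() {
      Tup<int, double, char> t{1, {2.5, {'x', {}}}};
      std::printf("%d %g %c\n", get<0>(t), get<1>(t), get<2>(t));  // 1 2.5 x
      return 0;
    }
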
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index aa89cd2..ae43f8e 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -22,61 +22,56 @@
 
 template <>
 struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
 };
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
   float2 r1 = __half22float2(a);
   float2 r2 = __half22float2(b);
   return make_float4(r1.x, r1.y, r2.x, r2.y);
 }
 
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
   Packet4h2 r;
-  half2* r_alias=reinterpret_cast<half2*>(&r);
-  r_alias[0]=__floats2half2_rn(a.x,a.y);
-  r_alias[1]=__floats2half2_rn(a.z,a.w);
-  r_alias[2]=__floats2half2_rn(b.x,b.y);
-  r_alias[3]=__floats2half2_rn(b.z,b.w);
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = __floats2half2_rn(a.x, a.y);
+  r_alias[1] = __floats2half2_rn(a.z, a.w);
+  r_alias[2] = __floats2half2_rn(b.x, b.y);
+  r_alias[3] = __floats2half2_rn(b.z, b.w);
   return r;
 }
 
 template <>
 struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 2,
-    TgtCoeffRatio = 1
-  };
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
 };
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
   // Simply discard the second half of the input
   float4 r;
-  const half2* a_alias=reinterpret_cast<const half2*>(&a);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
   float2 r1 = __half22float2(a_alias[0]);
   float2 r2 = __half22float2(a_alias[1]);
-  r.x=static_cast<float>(r1.x);
-  r.y=static_cast<float>(r1.y);
-  r.z=static_cast<float>(r2.x);
-  r.w=static_cast<float>(r2.y);
+  r.x = static_cast<float>(r1.x);
+  r.y = static_cast<float>(r1.y);
+  r.z = static_cast<float>(r2.x);
+  r.w = static_cast<float>(r2.y);
   return r;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
   // Simply discard the second half of the input
   return __floats2half2_rn(a.x, a.y);
 }
 
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_GPU_H
+#endif  // EIGEN_TYPE_CASTING_GPU_H
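
The casts above only change lane width: two half2 packets (four halves) widen
into one float4, and a float4 narrows into one half2 by dropping its upper
lanes; the SrcCoeffRatio/TgtCoeffRatio enums record this source-to-target
coefficient ratio for the vectorized evaluator. Below is a host-side sketch of
both directions; pair2/quad4 are illustrative stand-ins for half2/float4 with
the lanes already widened to float, not CUDA types.

    #include <cstdio>

    struct pair2 {  // stand-in for half2 (lanes shown as float for host code)
      float x, y;
    };
    struct quad4 {  // stand-in for float4
      float x, y, z, w;
    };

    // Mirrors pcast<half2, float4>: widen two 2-lane packets into one 4-lane packet.
    static quad4 cast_pair2_to_quad4(const pair2& a, const pair2& b) {
      // On device: r1 = __half22float2(a); r2 = __half22float2(b);
      //            return make_float4(r1.x, r1.y, r2.x, r2.y);
      return quad4{a.x, a.y, b.x, b.y};
    }

    // Mirrors pcast<float4, half2>: keep the low two lanes, discard the rest.
    static pair2 cast_quad4_to_pair2(const quad4& a) {
      return pair2{a.x, a.y};  // __floats2half2_rn(a.x, a.y) on device
    }

    int main() {
      quad4 q = cast_pair2_to_quad4(pair2{1, 2}, pair2{3, 4});
      pair2 p = cast_quad4_to_pair2(q);
      std::printf("%g %g %g %g | %g %g\n", q.x, q.y, q.z, q.w, p.x, p.y);
      return 0;
    }
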
diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
index 25375a0..99dd3ae 100644
--- a/Eigen/src/Core/arch/HIP/hcc/math_constants.h
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -1,5 +1,5 @@
 /*
- * math_constants.h - 
+ * math_constants.h -
  *  HIP equivalent of the CUDA header of the same name
  */
 
@@ -8,16 +8,16 @@
 
 /* single precision constants */
 
-#define HIPRT_INF_F        __int_as_float(0x7f800000)
-#define HIPRT_NAN_F        __int_as_float(0x7fffffff)
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
 #define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
 #define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
-#define HIPRT_NEG_ZERO_F   __int_as_float(0x80000000)
-#define HIPRT_ZERO_F       0.0f
-#define HIPRT_ONE_F        1.0f
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
 
 /* double precision constants */
-#define HIPRT_INF          __hiloint2double(0x7ff00000, 0x00000000)
-#define HIPRT_NAN          __hiloint2double(0xfff80000, 0x00000000)
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
 
 #endif
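
These macros are raw IEEE-754 bit patterns handed to __int_as_float, which
reinterprets bits rather than converting a value: 0x7f800000 is +inf (exponent
all ones, zero mantissa), 0x7fffffff a quiet NaN, 0x7f7fffff FLT_MAX,
0x80000000 negative zero, and 0x00000001 the smallest subnormal. A host-side
check follows, using memcpy as the portable bit cast; int_as_float here is a
stand-in for the device intrinsic.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Reinterpret a 32-bit pattern as float, like __int_as_float on device.
    static float int_as_float(std::uint32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }

    int main() {
      std::printf("%g\n", int_as_float(0x7f800000u));  // inf
      std::printf("%g\n", int_as_float(0x7fffffffu));  // nan
      std::printf("%g\n", int_as_float(0x7f7fffffu));  // 3.40282e+38 (FLT_MAX)
      std::printf("%g\n", int_as_float(0x80000000u));  // -0
      std::printf("%g\n", int_as_float(0x00000001u));  // ~1.4013e-45 (min subnormal)
      return 0;
    }
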
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
index 51f37fa..a159739 100644
--- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
@@ -9,31 +9,26 @@
 namespace internal {
 
 template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
-class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target,
-                  PacketSize_>
-    : public gebp_traits<float, float, ConjLhs_, ConjRhs_,
-                         Architecture::Generic, PacketSize_> {
+class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
  public:
   typedef Packet32qf AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero<Packet32qf>(p); }
 
   template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b,
-                                Packet32qf& c, Packet32f& /*tmp*/,
+  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/,
                                 const LaneIdType&) const {
     c = pmadd_f32_to_qf32(a, b, c);
   }
 
   template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a,
-                                const QuadPacket<Packet32f>& b, Packet32qf& c,
-                                Packet32f& tmp, const LaneIdType& lane) const {
+  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp,
+                                const LaneIdType& lane) const {
     madd(a, b.get(lane), c, tmp, lane);
   }
 
-  EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha,
-                               Packet32f& r) const {
+  EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const {
     r = pmadd_qf32_to_f32(c, alpha, r);
   }
 };
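
The HVX gebp_traits above keeps its accumulator in qf32, Hexagon's wider
intermediate float format, and converts back to IEEE single precision only
once, in acc(). Below is a scalar sketch of that accumulate-wide, convert-late
pattern, with double standing in for qf32 — purely illustrative, since qf32 is
a hardware format, not a C type.

    #include <cstdio>

    // initAcc / madd / acc collapsed into one scalar routine: accumulate the
    // products in a wider type and narrow a single time at the end.
    static float gebp_like_dot(const float* a, const float* b, int n, float alpha, float r) {
      double c = 0.0;  // initAcc: the wide accumulator starts at zero
      for (int i = 0; i < n; ++i) {
        c += static_cast<double>(a[i]) * b[i];  // madd: multiply-accumulate, kept wide
      }
      return static_cast<float>(c) * alpha + r;  // acc: r += alpha * c, one narrowing
    }

    int main() {
      float a[4] = {1, 2, 3, 4};
      float b[4] = {5, 6, 7, 8};
      std::printf("%g\n", gebp_like_dot(a, b, 4, 1.0f, 0.0f));  // prints: 70
      return 0;
    }
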
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index cc8722f..7c69f3b 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,21 +18,13 @@
 namespace Eigen {
 namespace internal {
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) {
-  return *((HVX_Vector*)mem);
-}
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); }
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) {
-  return *((HVX_UVector*)mem);
-}
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); }
 
-EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) {
-  *((HVX_Vector*)mem) = v;
-}
+EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; }
 
-EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) {
-  *((HVX_UVector*)mem) = v;
-}
+EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; }
 
 // The Hexagon compiler uses the same HVX_Vector to represent all HVX vector types.
 // Wrap each vector type (float32, int32, etc.) in its own class with
@@ -106,24 +98,18 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a,
-                                              const Packet32f& b) {
-  return Packet32f::Create(
-      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a,
-                                              const Packet32f& b) {
-  return Packet32f::Create(
-      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a,
-                                              const Packet32f& b) {
-  return Packet32f::Create(
-      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
 }
 
 template <>
@@ -153,8 +139,7 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a,
-                                             const Packet32f& b) {
+EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
   HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
   return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
@@ -175,16 +160,12 @@
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
   // Shuffle the 32-bit lanes.
-  HVX_VectorPair v_0_1_0 =
-      Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
-  HVX_VectorPair v_0_3_2 =
-      Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
 
   // Shuffle the 64-bit lanes.
-  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2),
-                                           HEXAGON_HVX_GET_V0(v_0_1_0), -8);
-  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2),
-                                           HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
 
   kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
   kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
@@ -194,174 +175,94 @@
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
   // Shuffle the 32-bit lanes.
-  HVX_VectorPair v_0_1_0 =
-      Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
-  HVX_VectorPair v_0_3_2 =
-      Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
-  HVX_VectorPair v_0_5_4 =
-      Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
-  HVX_VectorPair v_0_7_6 =
-      Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
-  HVX_VectorPair v_0_9_8 =
-      Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
-  HVX_VectorPair v_0_11_10 =
-      Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
-  HVX_VectorPair v_0_13_12 =
-      Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
-  HVX_VectorPair v_0_15_14 =
-      Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
-  HVX_VectorPair v_0_17_16 =
-      Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
-  HVX_VectorPair v_0_19_18 =
-      Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
-  HVX_VectorPair v_0_21_20 =
-      Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
-  HVX_VectorPair v_0_23_22 =
-      Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
-  HVX_VectorPair v_0_25_24 =
-      Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
-  HVX_VectorPair v_0_27_26 =
-      Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
-  HVX_VectorPair v_0_29_28 =
-      Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
-  HVX_VectorPair v_0_31_30 =
-      Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
+  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
+  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
+  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
+  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
+  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
+  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
+  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
 
   // Shuffle the 64-bit lanes.
-  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2),
-                                           HEXAGON_HVX_GET_V0(v_0_1_0), -8);
-  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2),
-                                           HEXAGON_HVX_GET_V1(v_0_1_0), -8);
-  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6),
-                                           HEXAGON_HVX_GET_V0(v_0_5_4), -8);
-  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6),
-                                           HEXAGON_HVX_GET_V1(v_0_5_4), -8);
-  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10),
-                                           HEXAGON_HVX_GET_V0(v_0_9_8), -8);
-  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10),
-                                             HEXAGON_HVX_GET_V1(v_0_9_8), -8);
-  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14),
-                                             HEXAGON_HVX_GET_V0(v_0_13_12), -8);
-  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14),
-                                             HEXAGON_HVX_GET_V1(v_0_13_12), -8);
-  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18),
-                                             HEXAGON_HVX_GET_V0(v_0_17_16), -8);
-  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18),
-                                             HEXAGON_HVX_GET_V1(v_0_17_16), -8);
-  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22),
-                                             HEXAGON_HVX_GET_V0(v_0_21_20), -8);
-  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22),
-                                             HEXAGON_HVX_GET_V1(v_0_21_20), -8);
-  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26),
-                                             HEXAGON_HVX_GET_V0(v_0_25_24), -8);
-  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26),
-                                             HEXAGON_HVX_GET_V1(v_0_25_24), -8);
-  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30),
-                                             HEXAGON_HVX_GET_V0(v_0_29_28), -8);
-  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30),
-                                             HEXAGON_HVX_GET_V1(v_0_29_28), -8);
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
+  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
+  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
+  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
+  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
+  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
+  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
+  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
+  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
+  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
+  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
 
   // Shuffle the 128-bit lanes.
-  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4),
-                            HEXAGON_HVX_GET_V0(v_1_1_0), -16);
-  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4),
-                            HEXAGON_HVX_GET_V1(v_1_1_0), -16);
-  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6),
-                            HEXAGON_HVX_GET_V0(v_1_3_2), -16);
-  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6),
-                            HEXAGON_HVX_GET_V1(v_1_3_2), -16);
-  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12),
-                            HEXAGON_HVX_GET_V0(v_1_9_8), -16);
-  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12),
-                              HEXAGON_HVX_GET_V1(v_1_9_8), -16);
-  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14),
-                              HEXAGON_HVX_GET_V0(v_1_11_10), -16);
-  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14),
-                              HEXAGON_HVX_GET_V1(v_1_11_10), -16);
-  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20),
-                              HEXAGON_HVX_GET_V0(v_1_17_16), -16);
-  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20),
-                              HEXAGON_HVX_GET_V1(v_1_17_16), -16);
-  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22),
-                              HEXAGON_HVX_GET_V0(v_1_19_18), -16);
-  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22),
-                              HEXAGON_HVX_GET_V1(v_1_19_18), -16);
-  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28),
-                              HEXAGON_HVX_GET_V0(v_1_25_24), -16);
-  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28),
-                              HEXAGON_HVX_GET_V1(v_1_25_24), -16);
-  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30),
-                              HEXAGON_HVX_GET_V0(v_1_27_26), -16);
-  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30),
-                              HEXAGON_HVX_GET_V1(v_1_27_26), -16);
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
+  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
+  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
+  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
+  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
+  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
+  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
+  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
+  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
+  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
+  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
+  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
 
   // Shuffle the 256-bit lanes.
-  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8),
-                            HEXAGON_HVX_GET_V0(v_0_1_0), -32);
-  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8),
-                            HEXAGON_HVX_GET_V1(v_0_1_0), -32);
-  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10),
-                            HEXAGON_HVX_GET_V0(v_0_3_2), -32);
-  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10),
-                            HEXAGON_HVX_GET_V1(v_0_3_2), -32);
-  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12),
-                            HEXAGON_HVX_GET_V0(v_0_5_4), -32);
-  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12),
-                              HEXAGON_HVX_GET_V1(v_0_5_4), -32);
-  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14),
-                              HEXAGON_HVX_GET_V0(v_0_7_6), -32);
-  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14),
-                              HEXAGON_HVX_GET_V1(v_0_7_6), -32);
-  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24),
-                              HEXAGON_HVX_GET_V0(v_0_17_16), -32);
-  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24),
-                              HEXAGON_HVX_GET_V1(v_0_17_16), -32);
-  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26),
-                              HEXAGON_HVX_GET_V0(v_0_19_18), -32);
-  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26),
-                              HEXAGON_HVX_GET_V1(v_0_19_18), -32);
-  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28),
-                              HEXAGON_HVX_GET_V0(v_0_21_20), -32);
-  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28),
-                              HEXAGON_HVX_GET_V1(v_0_21_20), -32);
-  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30),
-                              HEXAGON_HVX_GET_V0(v_0_23_22), -32);
-  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30),
-                              HEXAGON_HVX_GET_V1(v_0_23_22), -32);
+  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
+  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
+  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
+  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
+  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
+  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
+  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
+  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
+  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
+  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
+  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
+  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
 
   // Shuffle the 512-bit lanes.
-  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16),
-                            HEXAGON_HVX_GET_V0(v_1_1_0), -64);
-  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16),
-                            HEXAGON_HVX_GET_V1(v_1_1_0), -64);
-  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18),
-                            HEXAGON_HVX_GET_V0(v_1_3_2), -64);
-  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18),
-                            HEXAGON_HVX_GET_V1(v_1_3_2), -64);
-  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20),
-                            HEXAGON_HVX_GET_V0(v_1_5_4), -64);
-  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20),
-                              HEXAGON_HVX_GET_V1(v_1_5_4), -64);
-  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22),
-                              HEXAGON_HVX_GET_V0(v_1_7_6), -64);
-  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22),
-                              HEXAGON_HVX_GET_V1(v_1_7_6), -64);
-  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24),
-                              HEXAGON_HVX_GET_V0(v_1_9_8), -64);
-  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24),
-                              HEXAGON_HVX_GET_V1(v_1_9_8), -64);
-  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26),
-                              HEXAGON_HVX_GET_V0(v_1_11_10), -64);
-  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26),
-                              HEXAGON_HVX_GET_V1(v_1_11_10), -64);
-  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28),
-                              HEXAGON_HVX_GET_V0(v_1_13_12), -64);
-  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28),
-                              HEXAGON_HVX_GET_V1(v_1_13_12), -64);
-  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30),
-                              HEXAGON_HVX_GET_V0(v_1_15_14), -64);
-  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30),
-                              HEXAGON_HVX_GET_V1(v_1_15_14), -64);
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
+  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
+  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
+  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
+  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
+  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
+  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
+  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
+  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
+  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
+  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
+  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
+  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
 
   kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
   kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
@@ -401,12 +302,9 @@
 EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
   HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get());
   HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4);
-  HVX_Vector vsum_16 =
-      Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
-  HVX_Vector vsum_32 =
-      Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
-  HVX_Vector vsum_64 =
-      Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
+  HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
+  HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
+  HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
   return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64)));
 }
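
This predux is a log-time reduction: each rotate-and-add step doubles the span covered by every partial sum, so after rotating by 4, 8, 16, 32 and 64 bytes (1, 2, 4, 8 and 16 float lanes) every lane holds the full 32-lane total and pfirst can read any of them. A scalar sketch of the same ladder:

#include <cstdio>

int main() {
  float v[32];
  for (int i = 0; i < 32; ++i) v[i] = 1.0f + i;  // expected sum: 528

  // Each step models Q6_V_vror_VR by 4*lanes bytes followed by an add.
  for (int lanes = 1; lanes <= 16; lanes *= 2) {
    float rot[32];
    for (int i = 0; i < 32; ++i) rot[i] = v[(i + lanes) % 32];
    for (int i = 0; i < 32; ++i) v[i] += rot[i];
  }
  std::printf("%g\n", v[0]);  // prints 528; every lane now holds the total
}
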
 
@@ -421,8 +319,7 @@
 EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
   HVX_Vector load = HVX_loadu(from);
   HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
-  HVX_VectorPair quad =
-      Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
+  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
   return Packet32f::Create(HEXAGON_HVX_GET_V0(quad));
 }
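
The two shuffle rounds in ploadquad replicate each of the first eight source floats four times: shuffling the load with itself at word granularity doubles each element, and shuffling that result with itself at double-word granularity doubles the pairs. The net contract, as a scalar sketch (assuming 32-float HVX vectors):

void ploadquad_model(const float* from, float out[32]) {
  // out = {from[0] x4, from[1] x4, ..., from[7] x4}
  for (int i = 0; i < 32; ++i) out[i] = from[i / 4];
}
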
 
@@ -463,8 +360,7 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a,
-                                      const Packet32f& b) {
+EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
   HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
   return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
 }
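
Note the inversion in pselect: the predicate is true where the mask word is all zeros, so the mux places b in the taken slot and a in the fall-through slot. Per lane this amounts to:

// Scalar model: a zero mask word selects from b, anything nonzero selects a.
float pselect_lane_model(unsigned mask_bits, float a, float b) {
  return mask_bits == 0u ? b : a;
}
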
@@ -472,14 +368,10 @@
 template <typename Op>
 EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) {
   Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a);
-  Packet32f vredux_8 =
-      op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
-  Packet32f vredux_16 =
-      op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
-  Packet32f vredux_32 =
-      op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
-  Packet32f vredux_64 =
-      op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
+  Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
+  Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
+  Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
+  Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
   return pfirst(vredux_64);
 }
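
predux_generic reuses the same rotate ladder as predux but threads a caller-supplied binary op through it; the reductions below (e.g. the por-based predux_any) are built on it. A scalar sketch, using fminf as an example op:

#include <math.h>

float predux_generic_model(const float in[32], float (*op)(float, float)) {
  float v[32];
  for (int i = 0; i < 32; ++i) v[i] = in[i];
  for (int lanes = 1; lanes <= 16; lanes *= 2) {  // models vror by 4*lanes bytes
    float rot[32];
    for (int i = 0; i < 32; ++i) rot[i] = v[(i + lanes) % 32];
    for (int i = 0; i < 32; ++i) v[i] = op(v[i], rot[i]);
  }
  return v[0];  // pfirst
}
// e.g. predux_generic_model(data, fminf) yields the 32-lane minimum.
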
 
@@ -498,9 +390,9 @@
   return predux_generic(a, por<Packet32f>) != 0.0f;
 }
 
-static const float index_vsf[32] __attribute__((aligned(128))) = {
-    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+static const float index_vsf[32]
+    __attribute__((aligned(128))) = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
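
The aligned table supplies the 0..31 ramp so that plset(a) is just an aligned load plus a vector add. Its contract, as a scalar sketch:

void plset_model(float a, float out[32]) {
  for (int i = 0; i < 32; ++i) out[i] = a + static_cast<float>(i);  // a + {0,...,31}
}
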
 
 template <>
 EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
@@ -514,30 +406,23 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a,
-                                                const Packet32qf& b) {
+EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
   return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get()));
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a,
-                                                const Packet32qf& b) {
+EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
   return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get()));
 }
 
 // Mixed float32 and qfloat32 operations.
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a,
-                                                 const Packet32f& b,
-                                                 const Packet32qf& c) {
-  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(
-      Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
+EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) {
+  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
 }
 
-EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a,
-                                                const Packet32f& b,
-                                                const Packet32f& c) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(
-      Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
+EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) {
+  return Packet32f::Create(Q6_Vsf_equals_Vqf32(
+      Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
 }
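
These mixed helpers keep the accumulator in qf32 (Hexagon's wider intermediate float format) across the multiply-add and round back to IEEE single precision only at the boundary. A hypothetical scalar stand-in, using double purely to model the extra intermediate precision:

struct Qf32Model { double v; };  // hypothetical stand-in for a qf32-held lane

// a * b + c with the result kept wide (no rounding to float yet).
Qf32Model pmadd_f32_to_qf32_model(float a, float b, Qf32Model c) {
  return {static_cast<double>(a) * b + c.v};
}

// The wide accumulator a is first narrowed to float (the Q6_Vsf_equals_Vqf32
// step), then a * b + c is computed wide and rounded once at the end.
float pmadd_qf32_to_f32_model(Qf32Model a, float b, float c) {
  float a_sf = static_cast<float>(a.v);
  return static_cast<float>(static_cast<double>(a_sf) * b + c);
}
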
 
 }  // end namespace internal
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index b64bd8d..2d2fbbc 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -24,17 +24,13 @@
 
 //---------- float ----------
 struct Packet2cf {
-  EIGEN_STRONG_INLINE Packet2cf() {
-  }
-  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
-                                         const std::complex<float>& b) {
-    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a, const std::complex<float>& b) {
+    Packet4f t = {std::real(a), std::imag(a), std::real(b), std::imag(b)};
     v = t;
   }
-  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
-  }
-  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
-  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {}
   EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
     v = b.v;
     return *this;
@@ -61,33 +57,23 @@
     v = padd(v1, v2);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
-    return Packet2cf(*this) *= b;
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
   EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
     v = padd(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
-    return Packet2cf(*this) += b;
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
   EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
     v = psub(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
-    return Packet2cf(*this) -= b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
-    return pdiv_complex(Packet2cf(*this), b);
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { return pdiv_complex(Packet2cf(*this), b); }
   EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
     *this = Packet2cf(*this) / b;
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
-    return Packet2cf(pnegate(v));
-  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(pnegate(v)); }
 
   Packet4f v;
 };
@@ -126,7 +112,13 @@
 template <>
 struct unpacket_traits<Packet2cf> {
   typedef std::complex<float> type;
-  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet2cf half;
 };
 
@@ -135,8 +127,8 @@
   EIGEN_MSA_DEBUG;
 
   float f0 = from.real(), f1 = from.imag();
-  Packet4f v0 = { f0, f0, f0, f0 };
-  Packet4f v1 = { f1, f1, f1, f1 };
+  Packet4f v0 = {f0, f0, f0, f0};
+  Packet4f v1 = {f1, f1, f1, f1};
   return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
 }
 
@@ -225,32 +217,29 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
-                                                      const Packet2cf& from) {
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
   EIGEN_MSA_DEBUG;
 
   EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
-                                                       const Packet2cf& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
   EIGEN_MSA_DEBUG;
 
   EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
-    const std::complex<float>* from, Index stride) {
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
   EIGEN_MSA_DEBUG;
 
   return Packet2cf(from[0 * stride], from[1 * stride]);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
-                                                                       const Packet2cf& from,
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
                                                                        Index stride) {
   EIGEN_MSA_DEBUG;
 
@@ -300,8 +289,7 @@
 EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   EIGEN_MSA_DEBUG;
 
-  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
-                             (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
 }
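
With the interleaved layout [re0, im0, re1, im1], the expression above is the textbook product (re0*re1 - im0*im1) + i(re0*im1 + im0*re1) of the packet's two complex lanes. A quick scalar check:

#include <cassert>
#include <complex>

int main() {
  float v[4] = {1.f, 2.f, 3.f, 4.f};  // packet lanes (1+2i) and (3+4i)
  std::complex<float> p((v[0] * v[2]) - (v[1] * v[3]),
                        (v[0] * v[3]) + (v[1] * v[2]));
  // (1+2i)(3+4i) = (3-8) + (4+6)i = -5 + 10i
  assert(p == std::complex<float>(-5.f, 10.f));
}
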
 
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
@@ -321,39 +309,33 @@
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   EIGEN_MSA_DEBUG;
 
-  Packet4f tmp =
-      (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
-  kernel.packet[0].v =
-      (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  Packet4f tmp = (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[0].v = (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
   kernel.packet[1].v = tmp;
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
                                      const Packet2cf& elsePacket) {
-  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
-                                               (Packet2d)elsePacket.v);
+  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v, (Packet2d)elsePacket.v);
 }
 
 //---------- double ----------
 
 struct Packet1cd {
-  EIGEN_STRONG_INLINE Packet1cd() {
-  }
+  EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
     v[0] = std::real(a);
     v[1] = std::imag(a);
   }
-  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
-  }
-  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
-  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {}
   EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
     v = b.v;
     return *this;
   }
   EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
-    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+    static const v2u64 p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
     return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
   }
   EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
@@ -375,23 +357,17 @@
     v = padd(v1, v2);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
-    return Packet1cd(*this) *= b;
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
   EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
     v = padd(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
-    return Packet1cd(*this) += b;
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
   EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
     v = psub(v, b.v);
     return *this;
   }
-  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
-    return Packet1cd(*this) -= b;
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
   EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
     *this *= b.conjugate();
     Packet2d s = pmul<Packet2d>(b.v, b.v);
@@ -399,12 +375,8 @@
     v = pdiv(v, s);
     return *this;
   }
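
operator/= is the usual z / w = z * conj(w) / |w|^2; the hunk elides the code between the pmul and the pdiv, which presumably folds re(w)^2 + im(w)^2 into both lanes of s. For reference, the same computation in scalar form:

#include <complex>

std::complex<double> div_model(std::complex<double> z, std::complex<double> w) {
  double norm2 = w.real() * w.real() + w.imag() * w.imag();  // |w|^2
  std::complex<double> num = z * std::conj(w);
  return {num.real() / norm2, num.imag() / norm2};
}
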
-  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
-    return Packet1cd(*this) /= b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
-    return Packet1cd(pnegate(v));
-  }
+  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { return Packet1cd(*this) /= b; }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(pnegate(v)); }
 
   Packet2d v;
 };
@@ -439,7 +411,13 @@
 template <>
 struct unpacket_traits<Packet1cd> {
   typedef std::complex<double> type;
-  enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet1cd half;
 };
 
@@ -535,16 +513,14 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
-                                                       const Packet1cd& from) {
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_MSA_DEBUG;
 
   EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
-                                                        const Packet1cd& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
   EIGEN_MSA_DEBUG;
 
   EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
@@ -558,8 +534,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
-    const std::complex<double>* from, Index stride __attribute__((unused))) {
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride __attribute__((unused))) {
   EIGEN_MSA_DEBUG;
 
   Packet1cd res;
@@ -569,10 +545,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
-                                                                        const Packet1cd& from,
-                                                                        Index stride
-                                                                        __attribute__((unused))) {
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride __attribute__((unused))) {
   EIGEN_MSA_DEBUG;
 
   pstore(to, from);
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
index 3e77329..f68d254 100644
--- a/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -34,8 +34,7 @@
 namespace internal {
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-plog<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog<Packet4f>(const Packet4f& _x) {
   static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
   static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
   static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
@@ -122,8 +121,7 @@
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-pexp<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
   // Limiting single-precision pexp's argument to [-128, +128] lets pexp
   // reach 0 and INFINITY naturally.
   static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
@@ -143,10 +141,8 @@
   Packet4f x = _x;
 
   // Clamp x.
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
-                                     (v16u8)p4f_exp_lo);
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
-                                     (v16u8)p4f_exp_hi);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, (v16u8)p4f_exp_lo);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, (v16u8)p4f_exp_hi);
 
   // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
   Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
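
The rounding step builds ±0.5 by bit-inserting x's sign into 0.5 (binsli with immediate 0 copies just the leftmost bit), then adds and truncates, i.e. it rounds halves away from zero. A scalar sketch of that step, under the stated assumption about binsli:

#include <cmath>

float round_half_away_model(float x) {
  float half_with_sign = std::copysign(0.5f, x);  // the binsli step
  return std::trunc(x + half_with_sign);  // nearest integer, ties away from 0
}
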
@@ -175,8 +171,7 @@
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& _x) {
   static EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
   static EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
   // The monomial coefficients of the numerator polynomial (odd).
@@ -198,8 +193,7 @@
 
   // Clamp the inputs to the range [-9, 9] since anything outside
   // this range is -/+1.0f in single-precision.
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
-                                     (v16u8)p4f_tanh_hi);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, (v16u8)p4f_tanh_hi);
 
   // Since the polynomials are odd/even, we need x**2.
   Packet4f x2 = pmul(x, x);
@@ -264,7 +258,7 @@
   // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
   // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
   Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
-  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
+  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);  // bclri = bit-clear
   y = __builtin_msa_ffint_s_w(y_int2);
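  // Illustrative aside (not in the original source): addvi adds the
  // immediate 1 and bclri clears bit 0, which together compute
  // (octant + 1) & ~1. E.g. y_int = 1 becomes (1 + 1) & ~1 = 2, while an
  // even y_int = 2 stays put since (2 + 1) & ~1 = 2; every x is thus mapped
  // to an even octant with a translated argument in [-Pi/4, 0].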
 
   // Compute the sign to apply to the polynomial.
@@ -308,25 +302,22 @@
 
   // Update the sign.
   sign_mask = pxor(sign_mask, (Packet4i)y);
-  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
+  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);  // binsli = bit-insert-left
   return y;
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-psin<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin<Packet4f>(const Packet4f& x) {
   return psincos_inner_msa_float</* sine */ true>(x);
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-pcos<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos<Packet4f>(const Packet4f& x) {
   return psincos_inner_msa_float</* sine */ false>(x);
 }
 
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d
-pexp<Packet2d>(const Packet2d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
   // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
   // reach 0 and INFINITY naturally.
   static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
@@ -348,10 +339,8 @@
   Packet2d x = _x;
 
   // Clamp x.
-  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
-                                     (v16u8)p2d_exp_lo);
-  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
-                                     (v16u8)p2d_exp_hi);
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, (v16u8)p2d_exp_lo);
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, (v16u8)p2d_exp_hi);
 
   // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
   Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index b36f024..c1843c3 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -54,9 +54,9 @@
 typedef v4i32 Packet4i;
 typedef v4u32 Packet4ui;
 
-#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
-#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
-#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
 
 inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
@@ -115,14 +115,26 @@
 template <>
 struct unpacket_traits<Packet4f> {
   typedef float type;
-  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet4f half;
 };
 
 template <>
 struct unpacket_traits<Packet4i> {
   typedef int32_t type;
-  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet4i half;
 };
 
@@ -130,7 +142,7 @@
 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
   EIGEN_MSA_DEBUG;
 
-  Packet4f v = { from, from, from, from };
+  Packet4f v = {from, from, from, from};
   return v;
 }
 
@@ -146,7 +158,7 @@
   EIGEN_MSA_DEBUG;
 
   float f = *from;
-  Packet4f v = { f, f, f, f };
+  Packet4f v = {f, f, f, f};
   return v;
 }
 
@@ -175,7 +187,7 @@
 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
   EIGEN_MSA_DEBUG;
 
-  static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+  static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
   return padd(pset1<Packet4f>(a), countdown);
 }
 
@@ -183,7 +195,7 @@
 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
   EIGEN_MSA_DEBUG;
 
-  static const Packet4i countdown = { 0, 1, 2, 3 };
+  static const Packet4i countdown = {0, 1, 2, 3};
   return padd(pset1<Packet4i>(a), countdown);
 }
 
@@ -411,8 +423,8 @@
   EIGEN_MSA_DEBUG;
 
   float f0 = from[0], f1 = from[1];
-  Packet4f v0 = { f0, f0, f0, f0 };
-  Packet4f v1 = { f1, f1, f1, f1 };
+  Packet4f v0 = {f0, f0, f0, f0};
+  Packet4f v1 = {f1, f1, f1, f1};
   return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
 }
 
@@ -421,8 +433,8 @@
   EIGEN_MSA_DEBUG;
 
   int32_t i0 = from[0], i1 = from[1];
-  Packet4i v0 = { i0, i0, i0, i0 };
-  Packet4i v1 = { i1, i1, i1, i1 };
+  Packet4i v0 = {i0, i0, i0, i0};
+  Packet4i v1 = {i1, i1, i1, i1};
   return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
 }
 
@@ -459,7 +471,7 @@
   EIGEN_MSA_DEBUG;
 
   float f = *from;
-  Packet4f v = { f, f, f, f };
+  Packet4f v = {f, f, f, f};
   v[1] = from[stride];
   v[2] = from[2 * stride];
   v[3] = from[3 * stride];
@@ -471,7 +483,7 @@
   EIGEN_MSA_DEBUG;
 
   int32_t i = *from;
-  Packet4i v = { i, i, i, i };
+  Packet4i v = {i, i, i, i};
   v[1] = from[stride];
   v[2] = from[2 * stride];
   v[3] = from[3 * stride];
@@ -479,8 +491,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
-                                                        Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
   EIGEN_MSA_DEBUG;
 
   *to = from[0];
@@ -493,8 +504,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
-                                                          Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride) {
   EIGEN_MSA_DEBUG;
 
   *to = from[0];
@@ -572,7 +582,6 @@
   return s[0];
 }
 
-
 template <>
 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
   EIGEN_MSA_DEBUG;
@@ -618,8 +627,7 @@
 #endif
   // Continue with min computation.
   Packet4f v = __builtin_msa_fmin_w(a, swapped);
-  v = __builtin_msa_fmin_w(
-      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
 #if !EIGEN_FAST_MATH
   // Based on the mask select between v and 4 qNaNs.
   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
@@ -653,8 +661,7 @@
 #endif
   // Continue with max computation.
   Packet4f v = __builtin_msa_fmax_w(a, swapped);
-  v = __builtin_msa_fmax_w(
-      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
 #if !EIGEN_FAST_MATH
   // Based on the mask select between v and 4 qNaNs.
   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
@@ -801,8 +808,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
                                     const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
-                       ifPacket.select[3] };
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
   return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
 }
@@ -810,8 +816,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
                                     const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
-                       ifPacket.select[3] };
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
   return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
 }
@@ -822,9 +827,9 @@
 typedef v2i64 Packet2l;
 typedef v2u64 Packet2ul;
 
-#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
-#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
-#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
 
 inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
   os << "[ " << value[0] << ", " << value[1] << " ]";
@@ -864,7 +869,13 @@
 template <>
 struct unpacket_traits<Packet2d> {
   typedef double type;
-  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet2d half;
 };
 
@@ -872,7 +883,7 @@
 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   EIGEN_MSA_DEBUG;
 
-  Packet2d value = { from, from };
+  Packet2d value = {from, from};
   return value;
 }
 
@@ -887,7 +898,7 @@
 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
   EIGEN_MSA_DEBUG;
 
-  static const Packet2d countdown = { 0.0, 1.0 };
+  static const Packet2d countdown = {0.0, 1.0};
   return padd(pset1<Packet2d>(a), countdown);
 }
 
@@ -1011,7 +1022,7 @@
 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
   EIGEN_MSA_DEBUG;
 
-  Packet2d value = { *from, *from };
+  Packet2d value = {*from, *from};
   return value;
 }
 
@@ -1041,8 +1052,7 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
-                                                         Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
   EIGEN_MSA_DEBUG;
 
   *to = from[0];
@@ -1221,7 +1231,7 @@
 template <>
 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
                                     const Packet2d& elsePacket) {
-  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
   Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
   return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
 }
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 98b76da..8240847 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -18,70 +18,64 @@
 
 namespace internal {
 
-inline uint32x4_t p4ui_CONJ_XOR()
-{
+inline uint32x4_t p4ui_CONJ_XOR() {
 // See bug 1325: clang fails to call vld1q_u64.
 #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
-  uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+  uint32x4_t ret = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
   return ret;
 #else
-  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-  return vld1q_u32( conj_XOR_DATA );
+  static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  return vld1q_u32(conj_XOR_DATA);
 #endif
 }
 
-inline uint32x2_t p2ui_CONJ_XOR()
-{
-  static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
-  return vld1_u32( conj_XOR_DATA );
+inline uint32x2_t p2ui_CONJ_XOR() {
+  static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000};
+  return vld1_u32(conj_XOR_DATA);
 }
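
Both helpers load a mask with the IEEE-754 sign bit set in every imaginary lane; conjugation is then a single XOR rather than any arithmetic. The scalar equivalent of one lane flip:

#include <cstdint>
#include <cstring>

float flip_sign(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits ^= 0x80000000u;  // IEEE-754 sign bit
  std::memcpy(&x, &bits, sizeof x);
  return x;
}
// XORing [re0, im0, re1, im1] with {0, 0x80000000, 0, 0x80000000}
// yields [re0, -im0, re1, -im1], the lanewise conjugate.
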
 
 //---------- float ----------
 
-struct Packet1cf
-{
+struct Packet1cf {
   EIGEN_STRONG_INLINE Packet1cf() {}
   EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
   Packet2f v;
 };
-struct Packet2cf
-{
+struct Packet2cf {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
   Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
   typedef Packet1cf half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasMul       = 1,
-    HasDiv       = 1,
-    HasNegate    = 1,
-    HasSqrt      = 1,
-    HasAbs       = 0,
-    HasAbs2      = 0,
-    HasMin       = 0,
-    HasMax       = 0,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet1cf>
-{
+template <>
+struct unpacket_traits<Packet1cf> {
   typedef std::complex<float> type;
   typedef Packet1cf half;
   typedef Packet2f as_real;
-  enum
-  {
+  enum {
     size = 1,
     alignment = Aligned16,
     vectorizable = true,
@@ -89,13 +83,12 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet2cf>
-{
+template <>
+struct unpacket_traits<Packet2cf> {
   typedef std::complex<float> type;
   typedef Packet1cf half;
   typedef Packet4f as_real;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -104,45 +97,65 @@
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
-{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
-{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcast<float, Packet1cf>(const float& a) {
+  return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f, Packet2cf>(const Packet2f& a) {
+  return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
-{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from) {
+  return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
   return Packet2cf(vcombine_f32(r64, r64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(padd<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(psub<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) {
+  return Packet1cf(pnegate<Packet2f>(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate<Packet4f>(a.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) {
   const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
   return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
   const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
   Packet2f v1, v2;
 
   // Get the real values of a | a1_re | a1_re |
@@ -160,8 +173,8 @@
   // Add and return the result
   return Packet1cf(vadd_f32(v1, v2));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   Packet4f v1, v2;
 
   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
@@ -180,8 +193,8 @@
   return Packet2cf(vaddq_f32(v1, v2));
 }
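
Written out per lane, these pmul overloads compute v1 = a_re * (b_re, b_im) and v2 = a_im * (b_im, b_re) with v2's real component negated (presumably via the CONJ_XOR masks defined at the top of the file), so v1 + v2 is the standard complex product. A scalar restatement:

#include <complex>

std::complex<float> pmul_model(std::complex<float> a, std::complex<float> b) {
  float v1_re = a.real() * b.real(), v1_im = a.real() * b.imag();
  float v2_re = -(a.imag() * b.imag()), v2_im = a.imag() * b.real();
  return {v1_re + v2_re, v1_im + v2_im};  // equals a * b
}
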
 
-template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) {
   // Compare real and imaginary parts of a and b to get the mask vector:
   // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
   Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
@@ -191,8 +204,8 @@
   // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
   return Packet1cf(pand<Packet2f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
   // Compare real and imaginary parts of a and b to get the mask vector:
   // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
   Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
@@ -203,129 +216,178 @@
   return Packet2cf(pand<Packet4f>(eq, eq_swapped));
 }
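
For a concrete picture of the swap-and-AND trick used by both pcmp_eq overloads above: a complex lane compares equal only when both its real and imaginary comparisons hold, so the per-float mask is ANDed with a copy whose bits are swapped within each 64-bit (re, im) pair. A minimal sketch with plain NEON intrinsics, not the patched code itself:

#include <arm_neon.h>

// All-ones in both 32-bit lanes of a complex pair iff re and im both match.
float32x4_t cplx_eq_mask(float32x4_t a, float32x4_t b) {
  uint32x4_t eq = vceqq_f32(a, b);          // per-float equality masks
  uint32x4_t eq_swapped = vrev64q_u32(eq);  // swap re/im within each pair
  return vreinterpretq_f32_u32(vandq_u32(eq, eq_swapped));
}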
 
-template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
-{ return pset1<Packet1cf>(*from); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
-{ return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
-    const std::complex<float>* from, Index stride)
-{
-  const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
-  return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
+template <>
+EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
 }
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
-    const std::complex<float>* from, Index stride)
-{
-  Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
-  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
-  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
-  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from) {
+  return pset1<Packet1cf>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  const Packet2f tmp = vdup_n_f32(std::real(from[0 * stride]));
+  return Packet1cf(vset_lane_f32(std::imag(from[0 * stride]), tmp, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  Packet4f res = vdupq_n_f32(std::real(from[0 * stride]));
+  res = vsetq_lane_f32(std::imag(from[0 * stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1 * stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1 * stride]), res, 3);
   return Packet2cf(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
-    std::complex<float>* to, const Packet1cf& from, Index stride)
-{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
-    std::complex<float>* to, const Packet2cf& from, Index stride)
-{
-  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
-  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(std::complex<float>* to, const Packet1cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride * 1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
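
The strided gather above is just two interleaved complex loads placed lane by lane; a scalar sketch of what pgather<std::complex<float>, Packet2cf> assembles (gather2 is a hypothetical helper, not Eigen API):

#include <complex>

void gather2(const std::complex<float>* from, long stride, float out[4]) {
  out[0] = std::real(from[0 * stride]);  // lane 0: re of first element
  out[1] = std::imag(from[0 * stride]);  // lane 1: im of first element
  out[2] = std::real(from[1 * stride]);  // lane 2: re of second element
  out[3] = std::imag(from[1 * stride]);  // lane 3: im of second element
}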
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a) {
   EIGEN_ALIGN16 std::complex<float> x;
   vst1_f32(reinterpret_cast<float*>(&x), a.v);
   return x;
 }
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
   EIGEN_ALIGN16 std::complex<float> x[2];
   vst1q_f32(reinterpret_cast<float*>(x), a.v);
   return x[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
-{ return Packet1cf(vrev64_f32(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{ return Packet2cf(vrev64q_f32(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a) {
+  return Packet1cf(vrev64_f32(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(vrev64q_f32(a.v));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a) {
   std::complex<float> s;
-  vst1_f32((float *)&s, a.v);
+  vst1_f32((float*)&s, a.v);
   return s;
 }
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
   std::complex<float> s;
   vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
   return s;
 }
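
Because complex addition acts lane-wise on (re, im) pairs, the horizontal sum in predux<Packet2cf> reduces to one vadd of the low and high halves; a standalone sketch of the same reduction:

#include <arm_neon.h>
#include <complex>

std::complex<float> sum_two_complexes(float32x4_t v) {
  // [re0 + re1, im0 + im1]: adding the halves sums the two complex numbers.
  float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v));
  std::complex<float> out;
  vst1_f32(reinterpret_cast<float*>(&out), s);
  return out;
}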
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a) {
   std::complex<float> s;
-  vst1_f32((float *)&s, a.v);
+  vst1_f32((float*)&s, a.v);
   return s;
 }
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   float32x2_t a1, a2, v1, v2, prod;
   std::complex<float> s;
 
   a1 = vget_low_f32(a.v);
   a2 = vget_high_f32(a.v);
-   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vdup_lane_f32(a1, 0);
   // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vdup_lane_f32(a1, 1);
@@ -345,31 +407,32 @@
   return s;
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf, Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
   return pdiv_complex(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   return pdiv_complex(a, b);
 }
 
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
   return psqrt_complex<Packet1cf>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
   return psqrt_complex<Packet2cf>(a);
 }
 
@@ -378,84 +441,93 @@
 
 // See bug 1325: clang fails to call vld1q_u64.
 #if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CPE
-  static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
 #else
-  const uint64_t  p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
-  static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
+const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};
+static uint64x2_t p2ul_CONJ_XOR = vld1q_u64(p2ul_conj_XOR_DATA);
 #endif
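
The constant above encodes conjugation as a sign-bit flip on the imaginary double, which is all pconj<Packet1cd> below does. A self-contained sketch of that operation (assumes AArch64 GCC/clang, where NEON vector types accept brace initialization):

#include <arm_neon.h>

float64x2_t conj_f64(float64x2_t z) {
  // Flip only the sign bit of lane 1 (the imaginary part).
  const uint64x2_t mask = {0x0ULL, 0x8000000000000000ULL};
  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(z), mask));
}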
 
-struct Packet1cd
-{
+struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
   Packet2d v;
 };
 
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet1cd type;
   typedef Packet1cd half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet1cd>
-{
+template <>
+struct unpacket_traits<Packet1cd> {
   typedef std::complex<double> type;
   typedef Packet1cd half;
   typedef Packet2d as_real;
-  enum
-  {
-    size=1,
-    alignment=Aligned16,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
   /* here we really have to use unaligned loads :( */
   return ploadu<Packet1cd>(&from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(padd<Packet2d>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(psub<Packet2d>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
-{ return Packet1cd(pnegate<Packet2d>(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate<Packet2d>(a.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   Packet2d v1, v2;
 
   // Get the real values of a
@@ -474,8 +546,8 @@
   return Packet1cd(vaddq_f64(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
   // Compare real and imaginary parts of a and b to get the mask vector:
   // [re(a)==re(b), im(a)==im(b)]
   Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
@@ -486,81 +558,109 @@
   return Packet1cd(pand<Packet2d>(eq, eq_swapped));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
-{ return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v);
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr));
+}
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
-    const std::complex<double>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride) {
   Packet2d res = pset1<Packet2d>(0.0);
-  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
-  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+  res = vsetq_lane_f64(std::real(from[0 * stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0 * stride]), res, 1);
   return Packet1cd(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
-    std::complex<double>* to, const Packet1cd& from, Index stride)
-{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride) {
+  to[stride * 0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 std::complex<double> res;
   pstore<std::complex<double> >(&res, a);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   return pdiv_complex(a, b);
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{ return Packet1cd(preverse(Packet2d(x.v))); }
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
   Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
   return psqrt_complex<Packet1cd>(a);
 }
 
-#endif // EIGEN_ARCH_ARM64
+#endif  // EIGEN_ARCH_ARM64
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_NEON_H
+#endif  // EIGEN_COMPLEX_NEON_H
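
Before the GEBP changes: the pmul kernels reformatted above all follow the same real/imag lane decomposition. A compact sketch of that product for two interleaved complex<float> values, where vtrn1q/vtrn2q stand in for the dup-lane sequence Eigen spells out (AArch64 GCC/clang, illustrative only):

#include <arm_neon.h>

float32x4_t cplx_mul(float32x4_t a, float32x4_t b) {
  float32x4_t re = vtrn1q_f32(a, a);               // [re0, re0, re1, re1]
  float32x4_t im = vtrn2q_f32(a, a);               // [im0, im0, im1, im1]
  float32x4_t t1 = vmulq_f32(re, b);               // re * b
  float32x4_t t2 = vmulq_f32(im, vrev64q_f32(b));  // im * swapped(b)
  // Negate the real lanes of t2: result = [re*b.re - im*b.im, re*b.im + im*b.re]
  const uint32x4_t sign = {0x80000000u, 0u, 0x80000000u, 0u};
  t2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(t2), sign));
  return vaddq_f32(t1, t2);
}
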
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index 48410c5..4ecf7d1 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -9,38 +9,28 @@
 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
 // Here we specialize gebp_traits to eliminate these register spills.
 // See #2138.
-template<>
-struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
-  { 
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
     // This volatile inline ASM acts both as a barrier to prevent reordering
     // and as a means of enforcing strict register use.
-    asm volatile(
-      "vmla.f32 %q[r], %q[c], %q[alpha]"
-      : [r] "+w" (r)
-      : [c] "w" (c),
-        [alpha] "w" (alpha)
-      : );
+    asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w"(r) : [c] "w"(c), [alpha] "w"(alpha) :);
   }
 
   template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
-                                Packet4f& c, Packet4f&,
-                                const LaneIdType&) const {
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f&, const LaneIdType&) const {
     acc(a, b, c);
   }
-  
+
   template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
-                                Packet4f& c, Packet4f& tmp,
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c, Packet4f& tmp,
                                 const LaneIdType& lane) const {
     madd(a, b.get(lane), c, tmp, lane);
   }
 };
 
-#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+#endif  // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
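
The volatile asm in acc() above exists purely as a reordering barrier that pins register use; arithmetically it is an ordinary multiply-accumulate. A sketch of the same computation with plain intrinsics, i.e. what the compiler would emit without the barrier:

#include <arm_neon.h>

float32x4_t acc_generic(float32x4_t c, float32x4_t alpha, float32x4_t r) {
  return vmlaq_f32(r, c, alpha);  // r += c * alpha, lane-wise
}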
 
 #if EIGEN_ARCH_ARM64
 
@@ -48,139 +38,139 @@
 #define EIGEN_NEON_GEBP_NR 8
 #endif
 
-template<>
-struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
   typedef float RhsPacket;
   typedef float32x4_t RhsPacketx4;
   enum { nr = EIGEN_NEON_GEBP_NR };
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const {
-    dest = *b;
-  }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest = vld1q_f32(b);
-  }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
 
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
     c = vfmaq_n_f32(c, a, b);
   }
   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
 
  private:
-  template<int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
-         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
+    if (LaneID == 0)
+      asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 1)
+      asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 2)
+      asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 3)
+      asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
+#else
     c = vfmaq_laneq_f32(c, a, b, LaneID);
-    #endif
+#endif
   }
 };
 
-
-template<>
-struct gebp_traits <double,double,false,false,Architecture::NEON>
- : gebp_traits<double,double,false,false,Architecture::Generic>
-{
+template <>
+struct gebp_traits<double, double, false, false, Architecture::NEON>
+    : gebp_traits<double, double, false, false, Architecture::Generic> {
   typedef double RhsPacket;
   enum { nr = EIGEN_NEON_GEBP_NR };
   struct RhsPacketx4 {
     float64x2_t B_0, B_1;
   };
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
     dest.B_0 = vld1q_f64(b);
-    dest.B_1 = vld1q_f64(b+2);
+    dest.B_1 = vld1q_f64(b + 2);
   }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
 
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
     c = vfmaq_n_f64(c, a, b);
   }
 
   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
 
  private:
   template <int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
-         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    #else
-         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
-    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
-    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
-    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
-    #endif
+    if (LaneID == 0)
+      asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+    else if (LaneID == 1)
+      asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+    else if (LaneID == 2)
+      asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+    else if (LaneID == 3)
+      asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+#else
+    if (LaneID == 0)
+      c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if (LaneID == 1)
+      c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if (LaneID == 2)
+      c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if (LaneID == 3)
+      c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+#endif
   }
 };
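
The per-lane if/else chains above select an fmla with a literal lane index to work around gcc bug 89101 (pre-gcc9 lowered the lane intrinsics through a costly dup). On fixed compilers the whole helper collapses to one intrinsic; a sketch for the float case:

#include <arm_neon.h>

template <int Lane>
float32x4_t fma_lane(float32x4_t c, float32x4_t a, float32x4_t b) {
  // Single instruction: fmla vC.4s, vA.4s, vB.s[Lane]
  return vfmaq_laneq_f32(c, a, b, Lane);
}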
 
@@ -190,68 +180,64 @@
 // through a costly dup in gcc compiler.
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
 
-template<>
-struct gebp_traits <half,half,false,false,Architecture::NEON>
- : gebp_traits<half,half,false,false,Architecture::Generic>
-{
+template <>
+struct gebp_traits<half, half, false, false, Architecture::NEON>
+    : gebp_traits<half, half, false, false, Architecture::Generic> {
   typedef half RhsPacket;
   typedef float16x4_t RhsPacketx4;
   typedef float16x4_t PacketHalf;
   enum { nr = EIGEN_NEON_GEBP_NR };
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest = vld1_f16((const __fp16 *)b);
-  }
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1_f16((const __fp16*)b); }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
 
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
 
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const
-  {
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const {
     // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
     // using a single scalar value.
     eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
     c = vfmaq_n_f16(c, a, b);
   }
-  EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
+  EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
     c = vfma_n_f16(c, a, b);
   }
 
   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
+
  private:
-  template<int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
     c = vfmaq_lane_f16(c, a, b, LaneID);
   }
 };
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
-#endif // EIGEN_ARCH_ARM64
+#endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
+#endif  // EIGEN_ARCH_ARM64
 
 }  // namespace internal
 }  // namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 8611810..3d2e7bd 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -20,21 +20,18 @@
 
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet4hf ptanh<Packet4hf>(const Packet4hf& x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet4hf ptanh<Packet4hf>(const Packet4hf& x) {
   // Convert to float, call the float ptanh, and then convert back.
   return vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(x)));
 }
 
 template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
   // Convert each group of 4 halves to float, call the float ptanh, and then convert back.
-  return vcombine_f16(
-    vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
-    vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
+  return vcombine_f16(vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
+                      vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
 }
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
 BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
@@ -63,8 +60,8 @@
 
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATH_FUNCTIONS_NEON_H
+#endif  // EIGEN_MATH_FUNCTIONS_NEON_H
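
The fp16 ptanh overloads above share one pattern: widen to float, evaluate, narrow back. A standalone sketch of that round-trip, with std::tanh standing in for Eigen's vectorized ptanh (assumes AArch64 with fp16 storage support):

#include <arm_neon.h>
#include <cmath>

float16x8_t tanh_f16(float16x8_t x) {
  float32x4_t lo = vcvt_f32_f16(vget_low_f16(x));  // lower 4 halves -> float
  float32x4_t hi = vcvt_high_f32_f16(x);           // upper 4 halves -> float
  float buf[8];
  vst1q_f32(buf, lo);
  vst1q_f32(buf + 4, hi);
  for (int i = 0; i < 8; ++i) buf[i] = std::tanh(buf[i]);  // scalar stand-in
  return vcombine_f16(vcvt_f16_f32(vld1q_f32(buf)),
                      vcvt_f16_f32(vld1q_f32(buf + 4)));
}
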
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e70f8b0..4e3a14d 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -41,24 +41,24 @@
 // are aliases to the same underlying type __n128.
 // We thus have to wrap them to make them different C++ types.
 // (See also bug 1428)
-typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
-typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
-typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
-typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
-typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
-typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
-typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
-typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
-typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
-typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
-typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
-typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
-typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
-typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
-typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
-typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
-typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
-typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
+typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
+typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
+typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
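
The integer tag in eigen_packet_wrapper is what makes these typedefs distinct C++ types even when MSVC aliases every NEON vector to __n128. A sketch of the idea (hypothetical definition and names, not Eigen's actual wrapper):

#include <arm_neon.h>

template <typename T, int Tag>
struct packet_wrapper {
  T m_val;
  packet_wrapper() = default;
  packet_wrapper(const T& v) : m_val(v) {}
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
};

// Overloads on MyPacket4f vs. MyPacket4i now resolve unambiguously.
typedef packet_wrapper<float32x4_t, 1> MyPacket4f;
typedef packet_wrapper<int32x4_t, 13> MyPacket4i;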
 
 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
   float from[4] = {a, b, c, d};
@@ -72,405 +72,380 @@
 
 #else
 
-typedef float32x2_t                          Packet2f;
-typedef float32x4_t                          Packet4f;
-typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
-typedef int8x8_t                             Packet8c;
-typedef int8x16_t                            Packet16c;
-typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
-typedef uint8x8_t                            Packet8uc;
-typedef uint8x16_t                           Packet16uc;
-typedef int16x4_t                            Packet4s;
-typedef int16x8_t                            Packet8s;
-typedef uint16x4_t                           Packet4us;
-typedef uint16x8_t                           Packet8us;
-typedef int32x2_t                            Packet2i;
-typedef int32x4_t                            Packet4i;
-typedef uint32x2_t                           Packet2ui;
-typedef uint32x4_t                           Packet4ui;
-typedef int64x2_t                            Packet2l;
-typedef uint64x2_t                           Packet2ul;
+typedef float32x2_t Packet2f;
+typedef float32x4_t Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef int8x8_t Packet8c;
+typedef int8x16_t Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef uint8x8_t Packet8uc;
+typedef uint8x16_t Packet16uc;
+typedef int16x4_t Packet4s;
+typedef int16x8_t Packet8s;
+typedef uint16x4_t Packet4us;
+typedef uint16x8_t Packet8us;
+typedef int32x2_t Packet2i;
+typedef int32x4_t Packet4i;
+typedef uint32x2_t Packet2ui;
+typedef uint32x4_t Packet4ui;
+typedef int64x2_t Packet2l;
+typedef uint64x2_t Packet2ul;
 
 EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
 EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
 
-#endif // EIGEN_COMP_MSVC_STRICT
+#endif  // EIGEN_COMP_MSVC_STRICT
 
-EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
   const float* a = reinterpret_cast<const float*>(&m);
-  Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
   return res;
 }
 
 // functionally equivalent to _mm_shuffle_ps in SSE when interleave
 // == false (i.e. shuffle2<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
-// to enable a shared implementation for fast inversion of matrices of size 4. 
-template<bool interleave> 
-EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
-{
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template <bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
   return res;
 }
 
-template<> 
-EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
-  Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
   return res;
 }
 
-EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
+  return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
+}
 
-EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
-{ 
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
   return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
-{ 
-  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
+  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
-{
-  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
-{
-  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
-{
-  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
-{
-  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
 }
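
A worked check of the mask encoding these helpers rely on: each selector index occupies two bits, lowest index first, so the movelh/movehl patterns encode as 0x44 and 0xEE. A tiny self-contained verification, with shuffle_mask as a free-function mirror of eigen_neon_shuffle_mask:

#include <cassert>

int shuffle_mask(int p, int q, int r, int s) { return s << 6 | r << 4 | q << 2 | p; }

int main() {
  assert(shuffle_mask(0, 1, 0, 1) == 0x44);  // vec4f_movelh: a0, a1, b0, b1
  assert(shuffle_mask(2, 3, 2, 3) == 0xEE);  // vec4f_movehl: b2, b3, a2, a3
  return 0;
}
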
-#define vec4f_duplane(a, p) \
-  Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
+#define vec4f_duplane(a, p) Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
 
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
   const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
 
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
 #if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
-  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
-  // prefetch instructions there are too detailed for __builtin_prefetch to map
-  // meaningfully to them.
-  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
-  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
-  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
 #elif EIGEN_ARCH_ARM
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
 #else
-  // by default no explicit prefetching
-  #define EIGEN_ARM_PREFETCH(ADDR)
+// by default no explicit prefetching
+#define EIGEN_ARM_PREFETCH(ADDR)
 #endif
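
Whichever branch defines it, EIGEN_ARM_PREFETCH is used the same way: hint a cache line ahead of a streaming access. A hypothetical loop showing the intent (relies on the macro definition above; the look-ahead distance of 64 floats is illustrative):

#include <arm_neon.h>

void scale_in_place(float* x, long n) {
  for (long i = 0; i + 4 <= n; i += 4) {
    EIGEN_ARM_PREFETCH(x + i + 64);  // pull a later line while we work
    vst1q_f32(x + i, vmulq_n_f32(vld1q_f32(x + i), 2.0f));
  }
}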
 
 template <>
-struct packet_traits<float> : default_packet_traits
-{
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
   typedef Packet2f half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
 
-    HasDiv   = 1,
+    HasDiv = 1,
     HasFloor = 1,
     HasCeil = 1,
     HasRint = 1,
 
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasACos  = 1,
-    HasASin  = 1,
-    HasATan  = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
     HasATanh = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf  = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
     HasNdtri = 0
   };
 };
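
Aside: these enum entries are compile-time capability flags queried by Eigen's
evaluators; a minimal sketch, using only values declared in the table above:

    #include <Eigen/Core>

    static_assert(Eigen::internal::packet_traits<float>::size == 4,
                  "NEON packs four floats per Packet4f");
    static_assert(Eigen::internal::packet_traits<float>::HasDiv == 1,
                  "float division is vectorized");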
 
 template <>
-struct packet_traits<int8_t> : default_packet_traits
-{
+struct packet_traits<int8_t> : default_packet_traits {
   typedef Packet16c type;
   typedef Packet8c half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0
+    HasBlend = 0
   };
 };
 
 template <>
-struct packet_traits<uint8_t> : default_packet_traits
-{
+struct packet_traits<uint8_t> : default_packet_traits {
   typedef Packet16uc type;
   typedef Packet8uc half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 16,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
 
     HasSqrt = 1
   };
 };
 
 template <>
-struct packet_traits<int16_t> : default_packet_traits
-{
+struct packet_traits<int16_t> : default_packet_traits {
   typedef Packet8s type;
   typedef Packet4s half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0
+    HasBlend = 0
   };
 };
 
 template <>
-struct packet_traits<uint16_t> : default_packet_traits
-{
+struct packet_traits<uint16_t> : default_packet_traits {
   typedef Packet8us type;
   typedef Packet4us half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 8,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
     HasSqrt = 1
   };
 };
 
 template <>
-struct packet_traits<int32_t> : default_packet_traits
-{
+struct packet_traits<int32_t> : default_packet_traits {
   typedef Packet4i type;
   typedef Packet2i half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0
+    HasBlend = 0
   };
 };
 
 template <>
-struct packet_traits<uint32_t> : default_packet_traits
-{
+struct packet_traits<uint32_t> : default_packet_traits {
   typedef Packet4ui type;
   typedef Packet2ui half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
 
     HasSqrt = 1
   };
 };
 
 template <>
-struct packet_traits<int64_t> : default_packet_traits
-{
+struct packet_traits<int64_t> : default_packet_traits {
   typedef Packet2l type;
   typedef Packet2l half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0
+    HasBlend = 0
   };
 };
 
 template <>
-struct packet_traits<uint64_t> : default_packet_traits
-{
+struct packet_traits<uint64_t> : default_packet_traits {
   typedef Packet2ul type;
   typedef Packet2ul half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0
+    HasBlend = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2f>
-{
+template <>
+struct unpacket_traits<Packet2f> {
   typedef float type;
   typedef Packet2f half;
   typedef Packet2i integer_packet;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -478,13 +453,12 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4f>
-{
+template <>
+struct unpacket_traits<Packet4f> {
   typedef float type;
   typedef Packet2f half;
   typedef Packet4i integer_packet;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -492,12 +466,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4c>
-{
+template <>
+struct unpacket_traits<Packet4c> {
   typedef int8_t type;
   typedef Packet4c half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Unaligned,
     vectorizable = true,
@@ -505,12 +478,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet8c>
-{
+template <>
+struct unpacket_traits<Packet8c> {
   typedef int8_t type;
   typedef Packet4c half;
-  enum
-  {
+  enum {
     size = 8,
     alignment = Aligned16,
     vectorizable = true,
@@ -518,12 +490,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet16c>
-{
+template <>
+struct unpacket_traits<Packet16c> {
   typedef int8_t type;
   typedef Packet8c half;
-  enum
-  {
+  enum {
     size = 16,
     alignment = Aligned16,
     vectorizable = true,
@@ -531,12 +502,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4uc>
-{
+template <>
+struct unpacket_traits<Packet4uc> {
   typedef uint8_t type;
   typedef Packet4uc half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Unaligned,
     vectorizable = true,
@@ -544,12 +514,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet8uc>
-{
+template <>
+struct unpacket_traits<Packet8uc> {
   typedef uint8_t type;
   typedef Packet4uc half;
-  enum
-  {
+  enum {
     size = 8,
     alignment = Aligned16,
     vectorizable = true,
@@ -557,24 +526,23 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet16uc>
-{
+template <>
+struct unpacket_traits<Packet16uc> {
   typedef uint8_t type;
   typedef Packet8uc half;
-  enum
-  {
+  enum {
     size = 16,
     alignment = Aligned16,
     vectorizable = true,
     masked_load_available = false,
-    masked_store_available = false};
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet4s>
-{
+template <>
+struct unpacket_traits<Packet4s> {
   typedef int16_t type;
   typedef Packet4s half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -582,12 +550,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet8s>
-{
+template <>
+struct unpacket_traits<Packet8s> {
   typedef int16_t type;
   typedef Packet4s half;
-  enum
-  {
+  enum {
     size = 8,
     alignment = Aligned16,
     vectorizable = true,
@@ -595,12 +562,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4us>
-{
+template <>
+struct unpacket_traits<Packet4us> {
   typedef uint16_t type;
   typedef Packet4us half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -608,12 +574,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet8us>
-{
+template <>
+struct unpacket_traits<Packet8us> {
   typedef uint16_t type;
   typedef Packet4us half;
-  enum
-  {
+  enum {
     size = 8,
     alignment = Aligned16,
     vectorizable = true,
@@ -621,12 +586,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet2i>
-{
+template <>
+struct unpacket_traits<Packet2i> {
   typedef int32_t type;
   typedef Packet2i half;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -634,12 +598,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4i>
-{
+template <>
+struct unpacket_traits<Packet4i> {
   typedef int32_t type;
   typedef Packet2i half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -647,12 +610,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet2ui>
-{
+template <>
+struct unpacket_traits<Packet2ui> {
   typedef uint32_t type;
   typedef Packet2ui half;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -660,12 +622,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet4ui>
-{
+template <>
+struct unpacket_traits<Packet4ui> {
   typedef uint32_t type;
   typedef Packet2ui half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -673,12 +634,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet2l>
-{
+template <>
+struct unpacket_traits<Packet2l> {
   typedef int64_t type;
   typedef Packet2l half;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -686,12 +646,11 @@
     masked_store_available = false
   };
 };
-template<> struct unpacket_traits<Packet2ul>
-{
+template <>
+struct unpacket_traits<Packet2ul> {
   typedef uint64_t type;
   typedef Packet2ul half;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -700,1637 +659,2767 @@
   };
 };
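
Aside: a minimal sketch of the packet_traits/unpacket_traits round trip implied
by the tables above (the alias `ei` is illustrative):

    #include <type_traits>
    #include <Eigen/Core>

    namespace ei = Eigen::internal;
    using P = ei::packet_traits<float>::type;  // Packet4f per the table above
    static_assert(std::is_same<ei::unpacket_traits<P>::type, float>::value,
                  "unpacket_traits recovers the scalar type");
    static_assert(ei::unpacket_traits<P>::size == 4, "and the lane count");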
 
-template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
+  return vdup_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return vdupq_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
+  return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
+  return vdup_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
+  return vdupq_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
+  return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
+  return vdup_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
+  return vdupq_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
+  return vdup_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
+  return vdupq_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
+  return vdup_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
+  return vdupq_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
+  return vdup_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  return vdupq_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
+  return vdup_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return vdupq_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  return vdupq_n_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
+  return vdupq_n_u64(from);
+}
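
Aside: Packet4c/Packet4uc pack four 8-bit lanes into one 32-bit integer, which
is why their broadcasts above round-trip through a NEON d-register. A hedged
standalone equivalent (the function name is illustrative):

    #include <arm_neon.h>
    #include <cstdint>

    // Broadcast an int8 into the low four bytes of an int32 (Packet4c layout).
    inline int32_t broadcast4c(int8_t v) {
      int8x8_t dup = vdup_n_s8(v);                        // v in all 8 byte lanes
      return vget_lane_s32(vreinterpret_s32_s8(dup), 0);  // keep the low 4 bytes
    }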
 
-template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from)
-{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from)
-{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
+  return vreinterpret_f32_u32(vdup_n_u32(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return vreinterpretq_f32_u32(vdupq_n_u32(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
-{
-  const float c[] = {0.0f,1.0f};
+template <>
+EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
+  const float c[] = {0.0f, 1.0f};
   return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
-{
-  const float c[] = {0.0f,1.0f,2.0f,3.0f};
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
   return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
-{
-  const int8_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
+  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
   return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
 }
-template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
-{
-  const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
+  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
   return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
-{
-  const uint8_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
+  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
   return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
 }
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
-{
-  const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
+  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
   return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
-{
-  const int16_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
+  const int16_t c[] = {0, 1, 2, 3};
   return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
-{
-  const uint16_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
+  const uint16_t c[] = {0, 1, 2, 3};
   return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
 }
-template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
-{
-  const int16_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
+  const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
   return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
 }
-template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
-{
-  const uint16_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
+  const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
   return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
 }
-template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
-{
-  const int32_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
+  const int32_t c[] = {0, 1};
   return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
-{
-  const int32_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  const int32_t c[] = {0, 1, 2, 3};
   return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
-{
-  const uint32_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
+  const uint32_t c[] = {0, 1};
   return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
-{
-  const uint32_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  const uint32_t c[] = {0, 1, 2, 3};
   return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
 }
-template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
-{
-  const int64_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+  const int64_t c[] = {0, 1};
   return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
 }
-template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
-{
-  const uint64_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
+  const uint64_t c[] = {0, 1};
   return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
 }
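
Aside: every plset overload above produces the affine ramp {a, a+1, ...}, which
backs linearly spaced sequences. A hedged usage sketch, assuming Packet4f is
float32x4_t as on NEON:

    #include <arm_neon.h>
    #include <Eigen/Core>

    inline void demo_plset(float out[4]) {
      // out becomes {10, 11, 12, 13}
      vst1q_f32(out, Eigen::internal::plset<Eigen::internal::Packet4f>(10.0f));
    }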
 
-template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vadd_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vaddq_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vadd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vaddq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vadd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vaddq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vadd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vaddq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vadd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vaddq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vadd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vaddq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vadd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vaddq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vaddq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vaddq_u64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vsub_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vsubq_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vsub_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vsubq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vsub_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vsubq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vsub_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vsubq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vsub_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vsubq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vsub_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vsubq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vsub_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vsubq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vsubq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vsubq_u64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
-template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template <>
+EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
   Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
-template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
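
Aside: the constant flips only the sign bits of b's even lanes, so the padd
that follows yields the interleaved subtract/add pattern. Worked lanes for
Packet4f:

    // a = {1, 2, 3, 4},  b = {10, 20, 30, 40}
    // pxor(mask, b) = {-10, 20, -30, 40}   (sign bits of lanes 0 and 2 flipped)
    // paddsub(a, b) = {a0-b0, a1+b1, a2-b2, a3+b3} = {-9, 22, -27, 44}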
 
-template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
+  return vneg_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  return vnegq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
+  return vneg_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+  return vnegq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
+  return vneg_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return vnegq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
+  return vneg_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return vnegq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
 #if EIGEN_ARCH_ARM64
   return vnegq_s64(a);
 #else
-  return vcombine_s64(
-      vdup_n_s64(-vgetq_lane_s64(a, 0)),
-      vdup_n_s64(-vgetq_lane_s64(a, 1)));
+  return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
 #endif
 }
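
Aside: the #else branch exists because 32-bit NEON has no 64-bit vector negate;
each lane is extracted with vgetq_lane_s64, negated in scalar code, rebroadcast
with vdup_n_s64, and the two halves recombined with vcombine_s64.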
 
-template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
+  return a;
 }
-template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
-  return vcombine_s64(
-    vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
-    vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
+  return a;
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
-  return vcombine_u64(
-    vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
-    vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
+  return a;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmul_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmulq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmul_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vmulq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmul_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vmulq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmul_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vmulq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmul_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vmulq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmul_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vmulq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmul_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vmulq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
+}
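
Aside: NEON likewise has no 64-bit integer multiply instruction, so
pmul<Packet2l> and pmul<Packet2ul> fall back to two scalar multiplies per
packet, repacking the results with vdup_n_*64 and vcombine_*64 as above.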
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4c>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet8c>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet16c>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4uc>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet8uc>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet16uc>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4s>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet8s>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4us>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet8us>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet2i>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4i>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet2ui>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet4ui>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet2l>(0LL);
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
   eigen_assert(false && "packet integer division is not supported by NEON");
   return pset1<Packet2ul>(0ULL);
 }
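
Aside: the integer packet_traits earlier in this file do not set HasDiv (it
defaults to 0 in default_packet_traits), so Eigen's vectorized paths never
select these stubs; the eigen_assert only fires if one is called directly in a
debug build.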
 
-
 #ifdef __ARM_FEATURE_FMA
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{ return vfmaq_f32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
-{ return vfma_f32(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
-  return vmlaq_f32(c,a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmaq_f32(c, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
-{
-  return vmla_f32(c,a,b);
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfma_f32(c, a, b);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vmlaq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vmla_f32(c, a, b);
 }
 #endif
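
Aside: both branches compute a*b + c, but vfmaq_f32/vfma_f32 are true fused
multiply-adds (single rounding) while vmlaq_f32/vmla_f32 round the product
before the add. A hedged standalone sketch of the same dispatch:

    #include <arm_neon.h>

    inline float32x4_t madd4(float32x4_t a, float32x4_t b, float32x4_t c) {
    #ifdef __ARM_FEATURE_FMA
      return vfmaq_f32(c, a, b);  // fused: round(a*b + c)
    #else
      return vmlaq_f32(c, a, b);  // two roundings: round(round(a*b) + c)
    #endif
    }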
 
 // No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
-      vreinterpret_s8_s32(vdup_n_s32(c)),
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
+                                  vreinterpret_s8_s32(vdup_n_s32(b)))),
+      0);
 }
-template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
-{ return vmla_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
-{ return vmlaq_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
-      vreinterpret_u8_u32(vdup_n_u32(c)),
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
+  return vmla_s8(c, a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
-{ return vmla_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
-{ return vmlaq_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
-{ return vmla_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
-{ return vmlaq_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
-{ return vmla_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
-{ return vmlaq_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
-{ return vmla_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
-{ return vmlaq_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
-{ return vmla_u32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
-{ return vmlaq_u32(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return vmlaq_s8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
+                                  vreinterpret_u8_u32(vdup_n_u32(b)))),
+      0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
+  return vmla_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
+  return vmlaq_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
+  return vmla_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return vmlaq_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
+  return vmla_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return vmlaq_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
+  return vmla_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return vmlaq_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
+  return vmla_u32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+  return vmlaq_u32(c, a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vabd_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vabdq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
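+// pabsdiff computes the lane-wise absolute difference |a - b| via NEON's vabd
+// intrinsics.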
+template <>
+EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vabd_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vabd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vabdq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vabdq_f32(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vabd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vabdq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vabd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vabdq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vabd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vabdq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vabd_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vabdq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vabd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vabdq_u32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vabd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vabdq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vabd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vabdq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vabd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vabdq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vabd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vabdq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vabd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vabdq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vabd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vabdq_u32(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmin_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vminq_f32(a, b);
+}
 
 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case on
+// Armv8 systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vminnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vminnm_f32(a, b);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
-  return vcombine_s64(
-      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
-      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
-  return vcombine_u64(
-      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
-      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
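+// NEON's vmin/vmax already return NaN whenever either operand is NaN, so the
+// NaN-propagating variants simply forward to the default pmin/pmax.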
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmin<Packet4f>(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pmin<Packet2f>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmin_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vminq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmin_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vminq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmin_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vminq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmin_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vminq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmin_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vminq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmin_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vminq_u32(a, b);
+}
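+// NEON has no 64-bit integer min/max instructions, so the Packet2l/Packet2ul
+// variants of pmin/pmax are emulated lane by lane with scalar (std::min) and
+// (std::max), then recombined with vcombine.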
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+                      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+                      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmax_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmaxq_f32(a, b);
+}
 
 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case on
+// Armv8 systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmaxnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmaxnm_f32(a, b);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
-  return vcombine_s64(
-      vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
-      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
-  return vcombine_u64(
-      vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
-      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmax<Packet4f>(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pmax<Packet2f>(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vcle_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcleq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vcle_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcleq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vcle_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcleq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmax_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vmaxq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmax_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vmaxq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmax_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vmaxq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmax_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vmaxq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmax_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vmaxq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmax_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vmaxq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+                      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+                      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
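+// Vector comparisons yield per-lane masks (all-ones for true, all-zero for
+// false). The vc{le,lt,eq} intrinsics return unsigned masks, hence the
+// vreinterpret casts back to each packet's own element type.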
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vcle_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vcle_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vcleq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vcle_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vcleq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vcle_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vcleq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vcle_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vcleq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vcle_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vcleq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vcle_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vcleq_u32(a, b);
+}
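+// 64-bit compare instructions (vcleq_s64 & co) only exist on ARMv8; when
+// EIGEN_ARCH_ARM64 is not set, each lane is compared in scalar code and the
+// result is splatted to an all-ones/all-zero mask. pcmp_lt and pcmp_eq below
+// use the same fallback.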
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
 #if EIGEN_ARCH_ARM64
-  return vreinterpretq_s64_u64(vcleq_s64(a,b));
+  return vreinterpretq_s64_u64(vcleq_s64(a, b));
 #else
-  return vcombine_s64(
-      vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
-      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
 #if EIGEN_ARCH_ARM64
-  return vcleq_u64(a,b);
+  return vcleq_u64(a, b);
 #else
-  return vcombine_u64(
-      vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
-      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vclt_f32(a, b));
 }
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcltq_f32(a, b));
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vclt_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcltq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vclt_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcltq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vclt_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcltq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vclt_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vcltq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vclt_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vcltq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vclt_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vcltq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vclt_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vcltq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vclt_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vcltq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vclt_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vcltq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
 #if EIGEN_ARCH_ARM64
-  return vreinterpretq_s64_u64(vcltq_s64(a,b));
+  return vreinterpretq_s64_u64(vcltq_s64(a, b));
 #else
-  return vcombine_s64(
-      vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
-      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
 #if EIGEN_ARCH_ARM64
-  return vcltq_u64(a,b);
+  return vcltq_u64(a, b);
 #else
-  return vcombine_u64(
-      vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
-      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vceq_f32(a, b));
 }
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vceqq_f32(a, b));
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vceq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vceqq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vceq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vceqq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vceq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vceqq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vceq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vceqq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vceq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vceqq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vceq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vceqq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vceq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vceqq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vceq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vceqq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vceq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vceqq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
 #if EIGEN_ARCH_ARM64
-  return vreinterpretq_s64_u64(vceqq_s64(a,b));
+  return vreinterpretq_s64_u64(vceqq_s64(a, b));
 #else
-  return vcombine_s64(
-      vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
-      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
 #if EIGEN_ARCH_ARM64
-  return vceqq_u64(a,b);
+  return vceqq_u64(a, b);
 #else
-  return vcombine_u64(
-      vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
-      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
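+// pcmp_lt_or_nan is implemented as NOT(a >= b): vcge is false whenever either
+// operand is NaN, so the negated mask flags both a < b and unordered lanes.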
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
+}
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+// Logical operations are not supported for float, so we reinterpret-cast to unsigned integer lanes and back using
+// NEON intrinsics.
-template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vand_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vandq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vand_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vandq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vand_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vandq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vand_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vandq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vandq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
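+// Packet4c/Packet4uc are plain 32-bit integers, so scalar bitwise operators
+// already act on all four 8-bit lanes at once.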
+template <>
+EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vand_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vandq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vand_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vandq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vand_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vandq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vand_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vandq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vand_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vandq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vand_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vandq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vandq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vandq_u64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vorrq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vorr_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vorrq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vorr_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vorrq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vorr_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vorrq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vorr_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vorrq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vorrq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vorrq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vorr_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vorrq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vorr_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vorrq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vorr_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vorrq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vorr_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vorrq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vorr_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vorrq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vorr_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vorrq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vorrq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vorrq_u64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return veor_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return veorq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return veor_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return veorq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return veor_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return veorq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return veor_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return veorq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return veorq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return veorq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return veor_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return veorq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return veor_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return veorq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return veor_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return veorq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return veor_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return veorq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return veor_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return veorq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return veor_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return veorq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return veorq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return veorq_u64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vbic_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vbicq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vbic_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vbicq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vbic_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vbicq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vbic_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vbicq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vbic_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vbicq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vbicq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vbicq_u64(a,b); }
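+// pandnot(a, b) computes a & ~b; NEON's vbic ("bit clear") intrinsics implement
+// exactly this, so no explicit negation is needed.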
+template <>
+EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vbic_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vbicq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vbic_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vbicq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vbic_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vbicq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vbic_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vbicq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vbic_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vbicq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vbic_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vbicq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vbicq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vbicq_u64(a, b);
+}
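
// A minimal scalar sketch of the lane-wise rule the pandnot specializations
// above implement, assuming only what they show: pandnot(a, b) == a & ~b,
// which is exactly the NEON BIC ("bit clear") semantics of vbic_*/vbicq_*
// and of the Packet4c/Packet4uc fallback. The helper name scalar_pandnot is
// purely illustrative.
#include <cstdint>
static inline int8_t scalar_pandnot(int8_t a, int8_t b) {
  return static_cast<int8_t>(a & ~b);  // same per-lane result as vbic_s8(a, b)
}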
 
+template <int N>
+EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
+  return vshr_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
+  return vshrq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
+  return vshr_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return vshrq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
+  return vshr_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
+  return vshrq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
+  return vshrq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
 
-template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
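
// A scalar sketch of parithmetic_shift_right<N>, assuming only what the
// vshr(q)_n_* calls above require: N is a compile-time immediate in
// [1, lane-bits], and vacated bits are filled with copies of the sign bit.
// The helper name is hypothetical; built-in >> on a signed value is this
// arithmetic shift on two's-complement targets (guaranteed since C++20).
#include <cstdint>
template <int N>
static inline int16_t scalar_ashr(int16_t a) {
  static_assert(N >= 1 && N <= 16, "vshr_n_s16 takes an immediate in [1, 16]");
  return static_cast<int16_t>(a >> N);  // sign bit replicated into vacated bits
}
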
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
+  return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
+  return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
+  return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
+  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
 
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
-{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
-{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
-{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
-{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
-{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
-{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
-{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
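
// For signed lane types, plogical_shift_right above detours through the
// unsigned view (vreinterpret to uN, vshr, vreinterpret back) because the
// signed vshr_n_* intrinsics shift arithmetically. A scalar sketch of that
// zero-filling detour, with a hypothetical helper name:
#include <cstdint>
template <int N>
static inline int32_t scalar_lshr(int32_t a) {
  // shift in the unsigned domain so vacated bits are zero, then cast back
  return static_cast<int32_t>(static_cast<uint32_t>(a) >> N);
}
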
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
+  return vshl_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
+  return vshlq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
+  return vshl_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
+  return vshlq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
+  return vshl_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return vshlq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
+  return vshl_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
+  return vshlq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
+  return vshl_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
+  return vshlq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
+  return vshl_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
+  return vshlq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
+  return vshlq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
+  return vshlq_n_u64(a, N);
+}
 
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
   Packet4c res;
   memcpy(&res, from, sizeof(Packet4c));
   return res;
 }
-template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
   Packet4uc res;
   memcpy(&res, from, sizeof(Packet4uc));
   return res;
 }
-template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
+}
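
// Packet4c/Packet4uc pack four 8-bit lanes into a plain 32-bit integer
// (there is no 4-byte NEON register), so their pload falls back to memcpy,
// which is well-defined at any alignment. A self-contained sketch of that
// idiom, with a hypothetical helper name; the remaining overloads map
// straight onto vld1*, with EIGEN_DEBUG_ALIGNED_LOAD serving only Eigen's
// debug instrumentation.
#include <cstdint>
#include <cstring>
static inline int32_t load_packet4c(const int8_t* from) {
  int32_t res;
  std::memcpy(&res, from, sizeof(res));  // safe for any alignment, no UB
  return res;
}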
 
-template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
   Packet4c res;
   memcpy(&res, from, sizeof(Packet4c));
   return res;
 }
-template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
   Packet4uc res;
   memcpy(&res, from, sizeof(Packet4uc));
   return res;
 }
-template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
-{ return vld1_dup_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
+  return vld1_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
   const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
-  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
 }
-template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
   const int8x8_t a = vld1_s8(from);
-  return vzip_s8(a,a).val[0];
+  return vzip_s8(a, a).val[0];
 }
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
   const int8x8_t a = vld1_s8(from);
-  const int8x8x2_t b = vzip_s8(a,a);
+  const int8x8x2_t b = vzip_s8(a, a);
   return vcombine_s8(b.val[0], b.val[1]);
 }
-template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
   const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
-  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
+  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
 }
-template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
   const uint8x8_t a = vld1_u8(from);
-  return vzip_u8(a,a).val[0];
+  return vzip_u8(a, a).val[0];
 }
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
   const uint8x8_t a = vld1_u8(from);
-  const uint8x8x2_t b = vzip_u8(a,a);
+  const uint8x8x2_t b = vzip_u8(a, a);
   return vcombine_u8(b.val[0], b.val[1]);
 }
-template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
-{
-  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
-      vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
+  return vreinterpret_s16_u32(
+      vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
 }
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
   const int16x4_t a = vld1_s16(from);
-  const int16x4x2_t b = vzip_s16(a,a);
+  const int16x4x2_t b = vzip_s16(a, a);
   return vcombine_s16(b.val[0], b.val[1]);
 }
-template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
-{
-  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
-      vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
+  return vreinterpret_u16_u32(
+      vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
 }
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
   const uint16x4_t a = vld1_u16(from);
-  const uint16x4x2_t b = vzip_u16(a,a);
+  const uint16x4x2_t b = vzip_u16(a, a);
   return vcombine_u16(b.val[0], b.val[1]);
 }
-template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
-{ return vld1_dup_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
-{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
-{ return vld1_dup_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
-{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
-{ return vld1q_dup_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
-{ return vld1q_dup_u64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
-{
-  return vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
+  return vld1_dup_s32(from);
 }
-template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
-{
-  const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
-  const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
-  return vcombine_s8(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
 }
-template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
-{
-  return vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
+  return vld1_dup_u32(from);
 }
-template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
-{
-  const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
-  const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
-  return vcombine_u8(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+  return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
 }
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
-{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
-{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+  return vld1q_dup_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
+  return vld1q_dup_u64(from);
+}
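
// ploaddup reads size/2 scalars and duplicates each one in place, e.g.
// {a, b} -> {a, a, b, b}; the vzip-based bodies above get that pattern by
// interleaving a vector with itself. A scalar reference assuming only that
// rule, with a hypothetical name:
template <typename T, int N>
static inline void ref_ploaddup(const T* from, T (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i / 2];  // lane i <- scalar i/2
}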
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
+  return vld1q_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
+  return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
+  const int8x8_t a = vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+  const int8x8_t b = vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
+  return vcombine_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
+  return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
+  const uint8x8_t a = vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+  const uint8x8_t b = vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
+  return vcombine_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
+  return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
+  return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
+  return vld1q_dup_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
+  return vld1q_dup_u32(from);
+}
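
// ploadquad is the x4 analogue of ploaddup: each of the first size/4 scalars
// is replicated four times, e.g. {a, b} -> {a, a, a, a, b, b, b, b} for the
// 8-lane packets above (for the 4-lane packets it degenerates to vld1q_dup).
// A scalar reference under that assumption, with a hypothetical name:
template <typename T, int N>
static inline void ref_ploadquad(const T* from, T (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i / 4];  // lane i <- scalar i/4
}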
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
+}
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
   Packet2f res = vld1_dup_f32(from);
-  res = vld1_lane_f32(from + 1*stride, res, 1);
+  res = vld1_lane_f32(from + 1 * stride, res, 1);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
   Packet4f res = vld1q_dup_f32(from);
-  res = vld1q_lane_f32(from + 1*stride, res, 1);
-  res = vld1q_lane_f32(from + 2*stride, res, 2);
-  res = vld1q_lane_f32(from + 3*stride, res, 3);
+  res = vld1q_lane_f32(from + 1 * stride, res, 1);
+  res = vld1q_lane_f32(from + 2 * stride, res, 2);
+  res = vld1q_lane_f32(from + 3 * stride, res, 3);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
   Packet4c res;
-  for (int i = 0; i != 4; i++)
-    reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+  for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
   Packet8c res = vld1_dup_s8(from);
-  res = vld1_lane_s8(from + 1*stride, res, 1);
-  res = vld1_lane_s8(from + 2*stride, res, 2);
-  res = vld1_lane_s8(from + 3*stride, res, 3);
-  res = vld1_lane_s8(from + 4*stride, res, 4);
-  res = vld1_lane_s8(from + 5*stride, res, 5);
-  res = vld1_lane_s8(from + 6*stride, res, 6);
-  res = vld1_lane_s8(from + 7*stride, res, 7);
+  res = vld1_lane_s8(from + 1 * stride, res, 1);
+  res = vld1_lane_s8(from + 2 * stride, res, 2);
+  res = vld1_lane_s8(from + 3 * stride, res, 3);
+  res = vld1_lane_s8(from + 4 * stride, res, 4);
+  res = vld1_lane_s8(from + 5 * stride, res, 5);
+  res = vld1_lane_s8(from + 6 * stride, res, 6);
+  res = vld1_lane_s8(from + 7 * stride, res, 7);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
   Packet16c res = vld1q_dup_s8(from);
-  res = vld1q_lane_s8(from + 1*stride, res, 1);
-  res = vld1q_lane_s8(from + 2*stride, res, 2);
-  res = vld1q_lane_s8(from + 3*stride, res, 3);
-  res = vld1q_lane_s8(from + 4*stride, res, 4);
-  res = vld1q_lane_s8(from + 5*stride, res, 5);
-  res = vld1q_lane_s8(from + 6*stride, res, 6);
-  res = vld1q_lane_s8(from + 7*stride, res, 7);
-  res = vld1q_lane_s8(from + 8*stride, res, 8);
-  res = vld1q_lane_s8(from + 9*stride, res, 9);
-  res = vld1q_lane_s8(from + 10*stride, res, 10);
-  res = vld1q_lane_s8(from + 11*stride, res, 11);
-  res = vld1q_lane_s8(from + 12*stride, res, 12);
-  res = vld1q_lane_s8(from + 13*stride, res, 13);
-  res = vld1q_lane_s8(from + 14*stride, res, 14);
-  res = vld1q_lane_s8(from + 15*stride, res, 15);
+  res = vld1q_lane_s8(from + 1 * stride, res, 1);
+  res = vld1q_lane_s8(from + 2 * stride, res, 2);
+  res = vld1q_lane_s8(from + 3 * stride, res, 3);
+  res = vld1q_lane_s8(from + 4 * stride, res, 4);
+  res = vld1q_lane_s8(from + 5 * stride, res, 5);
+  res = vld1q_lane_s8(from + 6 * stride, res, 6);
+  res = vld1q_lane_s8(from + 7 * stride, res, 7);
+  res = vld1q_lane_s8(from + 8 * stride, res, 8);
+  res = vld1q_lane_s8(from + 9 * stride, res, 9);
+  res = vld1q_lane_s8(from + 10 * stride, res, 10);
+  res = vld1q_lane_s8(from + 11 * stride, res, 11);
+  res = vld1q_lane_s8(from + 12 * stride, res, 12);
+  res = vld1q_lane_s8(from + 13 * stride, res, 13);
+  res = vld1q_lane_s8(from + 14 * stride, res, 14);
+  res = vld1q_lane_s8(from + 15 * stride, res, 15);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
   Packet4uc res;
-  for (int i = 0; i != 4; i++)
-    reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+  for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
   Packet8uc res = vld1_dup_u8(from);
-  res = vld1_lane_u8(from + 1*stride, res, 1);
-  res = vld1_lane_u8(from + 2*stride, res, 2);
-  res = vld1_lane_u8(from + 3*stride, res, 3);
-  res = vld1_lane_u8(from + 4*stride, res, 4);
-  res = vld1_lane_u8(from + 5*stride, res, 5);
-  res = vld1_lane_u8(from + 6*stride, res, 6);
-  res = vld1_lane_u8(from + 7*stride, res, 7);
+  res = vld1_lane_u8(from + 1 * stride, res, 1);
+  res = vld1_lane_u8(from + 2 * stride, res, 2);
+  res = vld1_lane_u8(from + 3 * stride, res, 3);
+  res = vld1_lane_u8(from + 4 * stride, res, 4);
+  res = vld1_lane_u8(from + 5 * stride, res, 5);
+  res = vld1_lane_u8(from + 6 * stride, res, 6);
+  res = vld1_lane_u8(from + 7 * stride, res, 7);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
   Packet16uc res = vld1q_dup_u8(from);
-  res = vld1q_lane_u8(from + 1*stride, res, 1);
-  res = vld1q_lane_u8(from + 2*stride, res, 2);
-  res = vld1q_lane_u8(from + 3*stride, res, 3);
-  res = vld1q_lane_u8(from + 4*stride, res, 4);
-  res = vld1q_lane_u8(from + 5*stride, res, 5);
-  res = vld1q_lane_u8(from + 6*stride, res, 6);
-  res = vld1q_lane_u8(from + 7*stride, res, 7);
-  res = vld1q_lane_u8(from + 8*stride, res, 8);
-  res = vld1q_lane_u8(from + 9*stride, res, 9);
-  res = vld1q_lane_u8(from + 10*stride, res, 10);
-  res = vld1q_lane_u8(from + 11*stride, res, 11);
-  res = vld1q_lane_u8(from + 12*stride, res, 12);
-  res = vld1q_lane_u8(from + 13*stride, res, 13);
-  res = vld1q_lane_u8(from + 14*stride, res, 14);
-  res = vld1q_lane_u8(from + 15*stride, res, 15);
+  res = vld1q_lane_u8(from + 1 * stride, res, 1);
+  res = vld1q_lane_u8(from + 2 * stride, res, 2);
+  res = vld1q_lane_u8(from + 3 * stride, res, 3);
+  res = vld1q_lane_u8(from + 4 * stride, res, 4);
+  res = vld1q_lane_u8(from + 5 * stride, res, 5);
+  res = vld1q_lane_u8(from + 6 * stride, res, 6);
+  res = vld1q_lane_u8(from + 7 * stride, res, 7);
+  res = vld1q_lane_u8(from + 8 * stride, res, 8);
+  res = vld1q_lane_u8(from + 9 * stride, res, 9);
+  res = vld1q_lane_u8(from + 10 * stride, res, 10);
+  res = vld1q_lane_u8(from + 11 * stride, res, 11);
+  res = vld1q_lane_u8(from + 12 * stride, res, 12);
+  res = vld1q_lane_u8(from + 13 * stride, res, 13);
+  res = vld1q_lane_u8(from + 14 * stride, res, 14);
+  res = vld1q_lane_u8(from + 15 * stride, res, 15);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
   Packet4s res = vld1_dup_s16(from);
-  res = vld1_lane_s16(from + 1*stride, res, 1);
-  res = vld1_lane_s16(from + 2*stride, res, 2);
-  res = vld1_lane_s16(from + 3*stride, res, 3);
+  res = vld1_lane_s16(from + 1 * stride, res, 1);
+  res = vld1_lane_s16(from + 2 * stride, res, 2);
+  res = vld1_lane_s16(from + 3 * stride, res, 3);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
   Packet8s res = vld1q_dup_s16(from);
-  res = vld1q_lane_s16(from + 1*stride, res, 1);
-  res = vld1q_lane_s16(from + 2*stride, res, 2);
-  res = vld1q_lane_s16(from + 3*stride, res, 3);
-  res = vld1q_lane_s16(from + 4*stride, res, 4);
-  res = vld1q_lane_s16(from + 5*stride, res, 5);
-  res = vld1q_lane_s16(from + 6*stride, res, 6);
-  res = vld1q_lane_s16(from + 7*stride, res, 7);
+  res = vld1q_lane_s16(from + 1 * stride, res, 1);
+  res = vld1q_lane_s16(from + 2 * stride, res, 2);
+  res = vld1q_lane_s16(from + 3 * stride, res, 3);
+  res = vld1q_lane_s16(from + 4 * stride, res, 4);
+  res = vld1q_lane_s16(from + 5 * stride, res, 5);
+  res = vld1q_lane_s16(from + 6 * stride, res, 6);
+  res = vld1q_lane_s16(from + 7 * stride, res, 7);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
   Packet4us res = vld1_dup_u16(from);
-  res = vld1_lane_u16(from + 1*stride, res, 1);
-  res = vld1_lane_u16(from + 2*stride, res, 2);
-  res = vld1_lane_u16(from + 3*stride, res, 3);
+  res = vld1_lane_u16(from + 1 * stride, res, 1);
+  res = vld1_lane_u16(from + 2 * stride, res, 2);
+  res = vld1_lane_u16(from + 3 * stride, res, 3);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
   Packet8us res = vld1q_dup_u16(from);
-  res = vld1q_lane_u16(from + 1*stride, res, 1);
-  res = vld1q_lane_u16(from + 2*stride, res, 2);
-  res = vld1q_lane_u16(from + 3*stride, res, 3);
-  res = vld1q_lane_u16(from + 4*stride, res, 4);
-  res = vld1q_lane_u16(from + 5*stride, res, 5);
-  res = vld1q_lane_u16(from + 6*stride, res, 6);
-  res = vld1q_lane_u16(from + 7*stride, res, 7);
+  res = vld1q_lane_u16(from + 1 * stride, res, 1);
+  res = vld1q_lane_u16(from + 2 * stride, res, 2);
+  res = vld1q_lane_u16(from + 3 * stride, res, 3);
+  res = vld1q_lane_u16(from + 4 * stride, res, 4);
+  res = vld1q_lane_u16(from + 5 * stride, res, 5);
+  res = vld1q_lane_u16(from + 6 * stride, res, 6);
+  res = vld1q_lane_u16(from + 7 * stride, res, 7);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
   Packet2i res = vld1_dup_s32(from);
-  res = vld1_lane_s32(from + 1*stride, res, 1);
+  res = vld1_lane_s32(from + 1 * stride, res, 1);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
   Packet4i res = vld1q_dup_s32(from);
-  res = vld1q_lane_s32(from + 1*stride, res, 1);
-  res = vld1q_lane_s32(from + 2*stride, res, 2);
-  res = vld1q_lane_s32(from + 3*stride, res, 3);
+  res = vld1q_lane_s32(from + 1 * stride, res, 1);
+  res = vld1q_lane_s32(from + 2 * stride, res, 2);
+  res = vld1q_lane_s32(from + 3 * stride, res, 3);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
   Packet2ui res = vld1_dup_u32(from);
-  res = vld1_lane_u32(from + 1*stride, res, 1);
+  res = vld1_lane_u32(from + 1 * stride, res, 1);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
   Packet4ui res = vld1q_dup_u32(from);
-  res = vld1q_lane_u32(from + 1*stride, res, 1);
-  res = vld1q_lane_u32(from + 2*stride, res, 2);
-  res = vld1q_lane_u32(from + 3*stride, res, 3);
+  res = vld1q_lane_u32(from + 1 * stride, res, 1);
+  res = vld1q_lane_u32(from + 2 * stride, res, 2);
+  res = vld1q_lane_u32(from + 3 * stride, res, 3);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
   Packet2l res = vld1q_dup_s64(from);
-  res = vld1q_lane_s64(from + 1*stride, res, 1);
+  res = vld1q_lane_s64(from + 1 * stride, res, 1);
   return res;
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
   Packet2ul res = vld1q_dup_u64(from);
-  res = vld1q_lane_u64(from + 1*stride, res, 1);
+  res = vld1q_lane_u64(from + 1 * stride, res, 1);
   return res;
 }
 
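Note (not part of the patch): NEON has no hardware gather, so the pgather specializations above assemble the packet one lane at a time with vld1*_lane. A minimal standalone sketch checking that idiom against a scalar strided loop — main() and the stride value are illustrative only:

// Sketch only: the vld1q_lane gather idiom from pgather<int32_t, Packet4i>,
// verified against a scalar reference. Assumes a toolchain with <arm_neon.h>.
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  const int32_t data[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const long stride = 4;

  int32x4_t res = vld1q_dup_s32(data);              // lane 0 (broadcast, then overwrite the rest)
  res = vld1q_lane_s32(data + 1 * stride, res, 1);  // lane 1 = data[stride]
  res = vld1q_lane_s32(data + 2 * stride, res, 2);  // lane 2 = data[2*stride]
  res = vld1q_lane_s32(data + 3 * stride, res, 3);  // lane 3 = data[3*stride]

  int32_t out[4];
  vst1q_s32(out, res);
  for (int i = 0; i < 4; ++i) assert(out[i] == data[i * stride]);
  return 0;
}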
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
-{
-  vst1_lane_f32(to + stride*0, from, 0);
-  vst1_lane_f32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
+  vst1_lane_f32(to + stride * 0, from, 0);
+  vst1_lane_f32(to + stride * 1, from, 1);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
-  vst1q_lane_f32(to + stride*0, from, 0);
-  vst1q_lane_f32(to + stride*1, from, 1);
-  vst1q_lane_f32(to + stride*2, from, 2);
-  vst1q_lane_f32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  vst1q_lane_f32(to + stride * 0, from, 0);
+  vst1q_lane_f32(to + stride * 1, from, 1);
+  vst1q_lane_f32(to + stride * 2, from, 2);
+  vst1q_lane_f32(to + stride * 3, from, 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
-{
-  for (int i = 0; i != 4; i++)
-    *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
+  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
-{
-  vst1_lane_s8(to + stride*0, from, 0);
-  vst1_lane_s8(to + stride*1, from, 1);
-  vst1_lane_s8(to + stride*2, from, 2);
-  vst1_lane_s8(to + stride*3, from, 3);
-  vst1_lane_s8(to + stride*4, from, 4);
-  vst1_lane_s8(to + stride*5, from, 5);
-  vst1_lane_s8(to + stride*6, from, 6);
-  vst1_lane_s8(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
+  vst1_lane_s8(to + stride * 0, from, 0);
+  vst1_lane_s8(to + stride * 1, from, 1);
+  vst1_lane_s8(to + stride * 2, from, 2);
+  vst1_lane_s8(to + stride * 3, from, 3);
+  vst1_lane_s8(to + stride * 4, from, 4);
+  vst1_lane_s8(to + stride * 5, from, 5);
+  vst1_lane_s8(to + stride * 6, from, 6);
+  vst1_lane_s8(to + stride * 7, from, 7);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
-{
-  vst1q_lane_s8(to + stride*0, from, 0);
-  vst1q_lane_s8(to + stride*1, from, 1);
-  vst1q_lane_s8(to + stride*2, from, 2);
-  vst1q_lane_s8(to + stride*3, from, 3);
-  vst1q_lane_s8(to + stride*4, from, 4);
-  vst1q_lane_s8(to + stride*5, from, 5);
-  vst1q_lane_s8(to + stride*6, from, 6);
-  vst1q_lane_s8(to + stride*7, from, 7);
-  vst1q_lane_s8(to + stride*8, from, 8);
-  vst1q_lane_s8(to + stride*9, from, 9);
-  vst1q_lane_s8(to + stride*10, from, 10);
-  vst1q_lane_s8(to + stride*11, from, 11);
-  vst1q_lane_s8(to + stride*12, from, 12);
-  vst1q_lane_s8(to + stride*13, from, 13);
-  vst1q_lane_s8(to + stride*14, from, 14);
-  vst1q_lane_s8(to + stride*15, from, 15);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
+                                                                       Index stride) {
+  vst1q_lane_s8(to + stride * 0, from, 0);
+  vst1q_lane_s8(to + stride * 1, from, 1);
+  vst1q_lane_s8(to + stride * 2, from, 2);
+  vst1q_lane_s8(to + stride * 3, from, 3);
+  vst1q_lane_s8(to + stride * 4, from, 4);
+  vst1q_lane_s8(to + stride * 5, from, 5);
+  vst1q_lane_s8(to + stride * 6, from, 6);
+  vst1q_lane_s8(to + stride * 7, from, 7);
+  vst1q_lane_s8(to + stride * 8, from, 8);
+  vst1q_lane_s8(to + stride * 9, from, 9);
+  vst1q_lane_s8(to + stride * 10, from, 10);
+  vst1q_lane_s8(to + stride * 11, from, 11);
+  vst1q_lane_s8(to + stride * 12, from, 12);
+  vst1q_lane_s8(to + stride * 13, from, 13);
+  vst1q_lane_s8(to + stride * 14, from, 14);
+  vst1q_lane_s8(to + stride * 15, from, 15);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
-{
-  for (int i = 0; i != 4; i++)
-    *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
+                                                                        Index stride) {
+  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
-{
-  vst1_lane_u8(to + stride*0, from, 0);
-  vst1_lane_u8(to + stride*1, from, 1);
-  vst1_lane_u8(to + stride*2, from, 2);
-  vst1_lane_u8(to + stride*3, from, 3);
-  vst1_lane_u8(to + stride*4, from, 4);
-  vst1_lane_u8(to + stride*5, from, 5);
-  vst1_lane_u8(to + stride*6, from, 6);
-  vst1_lane_u8(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
+                                                                        Index stride) {
+  vst1_lane_u8(to + stride * 0, from, 0);
+  vst1_lane_u8(to + stride * 1, from, 1);
+  vst1_lane_u8(to + stride * 2, from, 2);
+  vst1_lane_u8(to + stride * 3, from, 3);
+  vst1_lane_u8(to + stride * 4, from, 4);
+  vst1_lane_u8(to + stride * 5, from, 5);
+  vst1_lane_u8(to + stride * 6, from, 6);
+  vst1_lane_u8(to + stride * 7, from, 7);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
-{
-  vst1q_lane_u8(to + stride*0, from, 0);
-  vst1q_lane_u8(to + stride*1, from, 1);
-  vst1q_lane_u8(to + stride*2, from, 2);
-  vst1q_lane_u8(to + stride*3, from, 3);
-  vst1q_lane_u8(to + stride*4, from, 4);
-  vst1q_lane_u8(to + stride*5, from, 5);
-  vst1q_lane_u8(to + stride*6, from, 6);
-  vst1q_lane_u8(to + stride*7, from, 7);
-  vst1q_lane_u8(to + stride*8, from, 8);
-  vst1q_lane_u8(to + stride*9, from, 9);
-  vst1q_lane_u8(to + stride*10, from, 10);
-  vst1q_lane_u8(to + stride*11, from, 11);
-  vst1q_lane_u8(to + stride*12, from, 12);
-  vst1q_lane_u8(to + stride*13, from, 13);
-  vst1q_lane_u8(to + stride*14, from, 14);
-  vst1q_lane_u8(to + stride*15, from, 15);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
+                                                                         Index stride) {
+  vst1q_lane_u8(to + stride * 0, from, 0);
+  vst1q_lane_u8(to + stride * 1, from, 1);
+  vst1q_lane_u8(to + stride * 2, from, 2);
+  vst1q_lane_u8(to + stride * 3, from, 3);
+  vst1q_lane_u8(to + stride * 4, from, 4);
+  vst1q_lane_u8(to + stride * 5, from, 5);
+  vst1q_lane_u8(to + stride * 6, from, 6);
+  vst1q_lane_u8(to + stride * 7, from, 7);
+  vst1q_lane_u8(to + stride * 8, from, 8);
+  vst1q_lane_u8(to + stride * 9, from, 9);
+  vst1q_lane_u8(to + stride * 10, from, 10);
+  vst1q_lane_u8(to + stride * 11, from, 11);
+  vst1q_lane_u8(to + stride * 12, from, 12);
+  vst1q_lane_u8(to + stride * 13, from, 13);
+  vst1q_lane_u8(to + stride * 14, from, 14);
+  vst1q_lane_u8(to + stride * 15, from, 15);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
-{
-  vst1_lane_s16(to + stride*0, from, 0);
-  vst1_lane_s16(to + stride*1, from, 1);
-  vst1_lane_s16(to + stride*2, from, 2);
-  vst1_lane_s16(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
+                                                                       Index stride) {
+  vst1_lane_s16(to + stride * 0, from, 0);
+  vst1_lane_s16(to + stride * 1, from, 1);
+  vst1_lane_s16(to + stride * 2, from, 2);
+  vst1_lane_s16(to + stride * 3, from, 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
-{
-  vst1q_lane_s16(to + stride*0, from, 0);
-  vst1q_lane_s16(to + stride*1, from, 1);
-  vst1q_lane_s16(to + stride*2, from, 2);
-  vst1q_lane_s16(to + stride*3, from, 3);
-  vst1q_lane_s16(to + stride*4, from, 4);
-  vst1q_lane_s16(to + stride*5, from, 5);
-  vst1q_lane_s16(to + stride*6, from, 6);
-  vst1q_lane_s16(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
+                                                                       Index stride) {
+  vst1q_lane_s16(to + stride * 0, from, 0);
+  vst1q_lane_s16(to + stride * 1, from, 1);
+  vst1q_lane_s16(to + stride * 2, from, 2);
+  vst1q_lane_s16(to + stride * 3, from, 3);
+  vst1q_lane_s16(to + stride * 4, from, 4);
+  vst1q_lane_s16(to + stride * 5, from, 5);
+  vst1q_lane_s16(to + stride * 6, from, 6);
+  vst1q_lane_s16(to + stride * 7, from, 7);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
-{
-  vst1_lane_u16(to + stride*0, from, 0);
-  vst1_lane_u16(to + stride*1, from, 1);
-  vst1_lane_u16(to + stride*2, from, 2);
-  vst1_lane_u16(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
+                                                                         Index stride) {
+  vst1_lane_u16(to + stride * 0, from, 0);
+  vst1_lane_u16(to + stride * 1, from, 1);
+  vst1_lane_u16(to + stride * 2, from, 2);
+  vst1_lane_u16(to + stride * 3, from, 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
-{
-  vst1q_lane_u16(to + stride*0, from, 0);
-  vst1q_lane_u16(to + stride*1, from, 1);
-  vst1q_lane_u16(to + stride*2, from, 2);
-  vst1q_lane_u16(to + stride*3, from, 3);
-  vst1q_lane_u16(to + stride*4, from, 4);
-  vst1q_lane_u16(to + stride*5, from, 5);
-  vst1q_lane_u16(to + stride*6, from, 6);
-  vst1q_lane_u16(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
+                                                                         Index stride) {
+  vst1q_lane_u16(to + stride * 0, from, 0);
+  vst1q_lane_u16(to + stride * 1, from, 1);
+  vst1q_lane_u16(to + stride * 2, from, 2);
+  vst1q_lane_u16(to + stride * 3, from, 3);
+  vst1q_lane_u16(to + stride * 4, from, 4);
+  vst1q_lane_u16(to + stride * 5, from, 5);
+  vst1q_lane_u16(to + stride * 6, from, 6);
+  vst1q_lane_u16(to + stride * 7, from, 7);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
-{
-  vst1_lane_s32(to + stride*0, from, 0);
-  vst1_lane_s32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
+                                                                       Index stride) {
+  vst1_lane_s32(to + stride * 0, from, 0);
+  vst1_lane_s32(to + stride * 1, from, 1);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
-{
-  vst1q_lane_s32(to + stride*0, from, 0);
-  vst1q_lane_s32(to + stride*1, from, 1);
-  vst1q_lane_s32(to + stride*2, from, 2);
-  vst1q_lane_s32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                                       Index stride) {
+  vst1q_lane_s32(to + stride * 0, from, 0);
+  vst1q_lane_s32(to + stride * 1, from, 1);
+  vst1q_lane_s32(to + stride * 2, from, 2);
+  vst1q_lane_s32(to + stride * 3, from, 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
-{
-  vst1_lane_u32(to + stride*0, from, 0);
-  vst1_lane_u32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
+                                                                         Index stride) {
+  vst1_lane_u32(to + stride * 0, from, 0);
+  vst1_lane_u32(to + stride * 1, from, 1);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
-{
-  vst1q_lane_u32(to + stride*0, from, 0);
-  vst1q_lane_u32(to + stride*1, from, 1);
-  vst1q_lane_u32(to + stride*2, from, 2);
-  vst1q_lane_u32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
+                                                                         Index stride) {
+  vst1q_lane_u32(to + stride * 0, from, 0);
+  vst1q_lane_u32(to + stride * 1, from, 1);
+  vst1q_lane_u32(to + stride * 2, from, 2);
+  vst1q_lane_u32(to + stride * 3, from, 3);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
-{
-  vst1q_lane_s64(to + stride*0, from, 0);
-  vst1q_lane_s64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
+                                                                       Index stride) {
+  vst1q_lane_s64(to + stride * 0, from, 0);
+  vst1q_lane_s64(to + stride * 1, from, 1);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
-{
-  vst1q_lane_u64(to + stride*0, from, 0);
-  vst1q_lane_u64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
+                                                                         Index stride) {
+  vst1q_lane_u64(to + stride * 0, from, 0);
+  vst1q_lane_u64(to + stride * 1, from, 1);
 }
 
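Note (not part of the patch): pscatter is the mirror image — vst1*_lane writes each lane to `to + i * stride`. Only the 4-byte char packets (Packet4c/Packet4uc) need the scalar loop, because those packets travel in a plain 32-bit integer with no lane-store intrinsic. A short sketch with an illustrative stride:

// Sketch only: lane-by-lane scatter as in pscatter<float, Packet4f>.
#include <arm_neon.h>
#include <cassert>

int main() {
  const float src[4] = {1.f, 2.f, 3.f, 4.f};
  float dst[13] = {0.f};
  const long stride = 4;

  float32x4_t v = vld1q_f32(src);
  vst1q_lane_f32(dst + stride * 0, v, 0);
  vst1q_lane_f32(dst + stride * 1, v, 1);
  vst1q_lane_f32(dst + stride * 2, v, 2);
  vst1q_lane_f32(dst + stride * 3, v, 3);

  for (int i = 0; i < 4; ++i) assert(dst[i * stride] == src[i]);
  return 0;
}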
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
 
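Note (not part of the patch): every prefetch<T> specialization funnels into EIGEN_ARM_PREFETCH; on most toolchains that macro expands to __builtin_prefetch or a pld/prfm instruction, but the exact expansion is compiler-dependent (assumption — check the macro's definition for your target). A sketch of how a caller might pair it with a strided walk, using the GCC/Clang builtin directly; strided_sum and the look-ahead distance of 8 are illustrative:

// Sketch only: prefetching ahead of a strided traversal.
#include <cstddef>

float strided_sum(const float* p, std::size_t n, std::size_t stride) {
  float acc = 0.f;
  for (std::size_t i = 0; i < n; ++i) {
    if (i + 8 < n) __builtin_prefetch(p + (i + 8) * stride);  // hint only; never faults
    acc += p[i * stride];
  }
  return acc;
}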
-template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
-template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return vgetq_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
+  return static_cast<int8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
+  return vget_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
+  return vgetq_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
+  return static_cast<uint8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
+  return vget_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
+  return vgetq_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
+  return vget_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
+  return vgetq_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
+  return vget_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
+  return vgetq_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  return vgetq_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return vgetq_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0);
+}
 
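Note (not part of the patch): pfirst extracts lane 0, which is a single vget*_lane for the real vector types. Packet4c/Packet4uc are the exception: they carry four 8-bit lanes inside one int32_t, so lane 0 is the low byte, hence `a & 0xff` — this relies on the little-endian byte packing these kernels use (assumption made explicit here). A scalar sketch:

// Sketch only: why pfirst<Packet4c> is a byte mask, not a lane extract.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const int8_t lanes[4] = {-7, 42, -1, 9};
  int32_t packed;                 // the Packet4c representation: 4 bytes in an int32_t
  std::memcpy(&packed, lanes, 4);
  assert(static_cast<int8_t>(packed & 0xff) == lanes[0]);  // little-endian: byte 0 = lane 0
  return 0;
}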
-template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
+  return vrev64_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
   const float32x4_t a_r64 = vrev64q_f32(a);
   return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
+  return vrev64_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
   const int8x16_t a_r64 = vrev64q_s8(a);
   return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
+  return vrev64_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
   const uint8x16_t a_r64 = vrev64q_u8(a);
   return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
+  return vrev64_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
   const int16x8_t a_r64 = vrev64q_s16(a);
   return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
+  return vrev64_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
   const uint16x8_t a_r64 = vrev64q_u16(a);
   return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
+  return vrev64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   const int32x4_t a_r64 = vrev64q_s32(a);
   return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
+  return vrev64_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
   const uint32x4_t a_r64 = vrev64q_u32(a);
   return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
 }
-template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
-{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
-{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+  return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
+  return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
+}
 
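Note (not part of the patch): there is no single full-width reverse on NEON, so the 128-bit preverse overloads work in two steps — vrev64q reverses lanes inside each 64-bit half, then vcombine(high, low) swaps the halves. A sketch verifying the Packet4f case:

// Sketch only: the two-step 128-bit reverse used by preverse<Packet4f>.
#include <arm_neon.h>
#include <cassert>

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float32x4_t a = vld1q_f32(in);

  float32x4_t r64 = vrev64q_f32(a);                                       // {2,1,4,3}
  float32x4_t rev = vcombine_f32(vget_high_f32(r64), vget_low_f32(r64));  // {4,3,2,1}

  float out[4];
  vst1q_f32(out, rev);
  for (int i = 0; i < 4; ++i) assert(out[i] == in[3 - i]);
  return 0;
}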
-template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
+  return vabs_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return vabsq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
+  return vabs_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return vabsq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
+  return vabs_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return vabsq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
+  return vabs_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return vabsq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
 #if EIGEN_ARCH_ARM64
   return vabsq_s64(a);
 #else
-  return vcombine_s64(
-      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
-      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+  return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
+  return a;
+}
 
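Note (not part of the patch): pabs is the identity for every unsigned packet, and Packet2l is the one signed case that needs a fallback — vabsq_s64 exists only on AArch64, so the pre-ARM64 path shown above rebuilds the vector from two scalar std::abs calls. That fallback in isolation:

// Sketch only: the pre-ARM64 pabs<Packet2l> fallback (vabsq_s64 is AArch64-only).
#include <arm_neon.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  const int64_t in[2] = {-5, 7};
  int64x2_t a = vld1q_s64(in);
  int64x2_t r = vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
                             vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
  assert(vgetq_lane_s64(r, 0) == 5 && vgetq_lane_s64(r, 1) == 7);
  return 0;
}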
 template <>
 EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
@@ -2341,47 +3430,70 @@
   return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
 }
 
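Note (not part of the patch): psignbit broadcasts the IEEE sign bit across each lane with an arithmetic right shift by 31 — negatives (including -0.0f) become all-ones, everything else becomes zero. A sketch checking that bit pattern:

// Sketch only: sign-bit broadcast via arithmetic shift, as in psignbit<Packet4f>.
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  const float in[4] = {-1.f, 2.f, -0.f, 3.f};
  float32x4_t a = vld1q_f32(in);
  float32x4_t s = vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));

  uint32_t bits[4];
  vst1q_u32(bits, vreinterpretq_u32_f32(s));
  assert(bits[0] == 0xffffffffu && bits[1] == 0u);
  assert(bits[2] == 0xffffffffu && bits[3] == 0u);  // -0.0f carries the sign bit
  return 0;
}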
-template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
-{ return pfrexp_generic(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
-{ return pfrexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
-{ return pldexp_generic(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
-{ return pldexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
 
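Note (not part of the patch): both pfrexp and pldexp simply forward to the generic bit-manipulation helpers, so their per-lane contract is the std::frexp/std::ldexp one: a = m * 2^e with |m| in [0.5, 1). A scalar sketch of that contract:

// Sketch only: the per-lane contract pfrexp/pldexp implement,
// shown with the scalar <cmath> equivalents.
#include <cassert>
#include <cmath>

int main() {
  int e = 0;
  const float m = std::frexp(48.f, &e);  // 48 = 0.75 * 2^6
  assert(m == 0.75f && e == 6);
  assert(std::ldexp(m, e) == 48.f);      // pldexp undoes pfrexp
  return 0;
}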
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vaddv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { return vaddvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+  return vaddv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  return vaddvq_f32(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpadd_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
   const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
   return vget_lane_f32(vpadd_f32(sum, sum), 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   int8x8_t sum = vpadd_s8(a_dup, a_dup);
   sum = vpadd_s8(sum, sum);
   return vget_lane_s8(sum, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) { return vaddv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) { return vaddvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+  return vaddv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+  return vaddvq_s8(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
-{
-  int8x8_t sum = vpadd_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+  int8x8_t sum = vpadd_s8(a, a);
   sum = vpadd_s8(sum, sum);
   sum = vpadd_s8(sum, sum);
   return vget_lane_s8(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
   int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
   sum = vpadd_s8(sum, sum);
   sum = vpadd_s8(sum, sum);
@@ -2389,144 +3501,204 @@
   return vget_lane_s8(sum, 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   uint8x8_t sum = vpadd_u8(a_dup, a_dup);
   sum = vpadd_u8(sum, sum);
   return vget_lane_u8(sum, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) { return vaddv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) { return vaddvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) { return vaddv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) { return vaddvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) { return vaddv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) { return vaddvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vaddv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { return vaddvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vaddv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) { return vaddvq_u32(a); }
-template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) { return vaddvq_s64(a); }
-template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) { return vaddvq_u64(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+  return vaddv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+  return vaddvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+  return vaddv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+  return vaddvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+  return vaddv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+  return vaddvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+  return vaddv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  return vaddvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+  return vaddv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+  return vaddvq_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return vaddvq_s64(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return vaddvq_u64(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t sum = vpadd_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t sum = vpadd_u8(a, a);
   sum = vpadd_u8(sum, sum);
   sum = vpadd_u8(sum, sum);
   return vget_lane_u8(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
   uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
   sum = vpadd_u8(sum, sum);
   sum = vpadd_u8(sum, sum);
   sum = vpadd_u8(sum, sum);
   return vget_lane_u8(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t sum = vpadd_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+  const int16x4_t sum = vpadd_s16(a, a);
   return vget_lane_s16(vpadd_s16(sum, sum), 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
   int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
   sum = vpadd_s16(sum, sum);
   sum = vpadd_s16(sum, sum);
   return vget_lane_s16(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t sum = vpadd_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+  const uint16x4_t sum = vpadd_u16(a, a);
   return vget_lane_u16(vpadd_u16(sum, sum), 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
   uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
   sum = vpadd_u16(sum, sum);
   sum = vpadd_u16(sum, sum);
   return vget_lane_u16(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpadd_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
   const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
   return vget_lane_s32(vpadd_s32(sum, sum), 0);
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpadd_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
   const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
   return vget_lane_u32(vpadd_u32(sum, sum), 0);
 }
-template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+}
 #endif
 
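Note (not part of the patch): the sum reductions split cleanly by architecture — AArch64 gets single-instruction vaddv/vaddvq, while 32-bit NEON adds the two vector halves and then folds with vpadd until the total sits in lane 0 (the 64-bit packets drop to scalar adds on that path). A sketch of the folding tree for Packet4f:

// Sketch only: the vpadd folding behind predux<Packet4f> on 32-bit ARM.
#include <arm_neon.h>
#include <cassert>

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float32x4_t a = vld1q_f32(in);

  float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));  // {1+3, 2+4} = {4, 6}
  sum = vpadd_f32(sum, sum);                                      // {10, 10}
  assert(vget_lane_f32(sum, 0) == 10.f);
  return 0;
}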
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
-      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
-{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
-      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
+  return vadd_s8(vget_high_s8(a), vget_low_s8(a));
 }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
-{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
-{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
-{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
+  return vadd_u8(vget_high_u8(a), vget_low_u8(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
+  return vadd_s16(vget_high_s16(a), vget_low_s16(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
+  return vadd_u16(vget_high_u16(a), vget_low_u16(a));
+}
 
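Note (not part of the patch): predux_half_dowto4 (spelling as in the source) halves the packet width by adding the high half to the low half; the 8-to-4 byte variants additionally vrev64 the 32-bit words so the two 4-byte halves line up inside one 64-bit register before the add. A sketch of the 16-byte case:

// Sketch only: predux_half_dowto4 on a 16-lane char packet is low half + high half.
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  int8_t in[16];
  for (int i = 0; i < 16; ++i) in[i] = static_cast<int8_t>(i);
  int8x16_t a = vld1q_s8(in);

  int8x8_t half = vadd_s8(vget_high_s8(a), vget_low_s8(a));  // lane i = in[i] + in[i+8]
  int8_t out[8];
  vst1_s8(out, half);
  for (int i = 0; i < 8; ++i) assert(out[i] == static_cast<int8_t>(in[i] + in[i + 8]));
  return 0;
}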
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
   int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
   prod = vmul_s8(prod, vrev16_s8(prod));
   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
   int8x8_t prod = vmul_s8(a, vrev16_s8(a));
   prod = vmul_s8(prod, vrev32_s8(prod));
   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+  return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
   uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
   prod = vmul_u8(prod, vrev16_u8(prod));
   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
   uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
   prod = vmul_u8(prod, vrev32_u8(prod));
   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+  return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
   const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
   return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
   int16x4_t prod;
 
   // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
@@ -2536,13 +3708,13 @@
   // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
   return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
   const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
   return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
   uint16x4_t prod;
 
   // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
@@ -2552,52 +3724,78 @@
   // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
   return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
 }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
-template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+  return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
+}
 
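Note (not part of the patch): there is no pairwise-multiply instruction, so the multiplicative reductions above multiply the vector by byte-reversed copies of itself — vrev16 pairs each lane with its neighbour inside 16-bit groups, vrev32 inside 32-bit groups — halving the number of distinct partial products per step and finishing with two scalar lane reads. A sketch tracing predux_mul<Packet8c>:

// Sketch only: log-step multiplicative reduction, as in predux_mul<Packet8c>.
// Products wrap modulo 2^8, matching the int8 packet semantics.
#include <arm_neon.h>
#include <cassert>
#include <cstdint>

int main() {
  const int8_t in[8] = {1, 2, 1, 2, 1, 2, 1, 2};  // product = 16
  int8x8_t a = vld1_s8(in);

  int8x8_t prod = vmul_s8(a, vrev16_s8(a));  // lane 0 = a0*a1, lane 2 = a2*a3, ...
  prod = vmul_s8(prod, vrev32_s8(prod));     // lane 0 = a0*a1*a2*a3, lane 4 = a4*a5*a6*a7
  int8_t r = static_cast<int8_t>(vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4));
  assert(r == 16);
  return 0;
}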
 // min
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) { return vminv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { return vminvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+  return vminv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  return vminvq_f32(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmin_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpmin_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
   const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
   return vget_lane_f32(vpmin_f32(min, min), 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   int8x8_t min = vpmin_s8(a_dup, a_dup);
   min = vpmin_s8(min, min);
   return vget_lane_s8(min, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) { return vminv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) { return vminvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+  return vminv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+  return vminvq_s8(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
-{
-  int8x8_t min = vpmin_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+  int8x8_t min = vpmin_s8(a, a);
   min = vpmin_s8(min, min);
   min = vpmin_s8(min, min);
   return vget_lane_s8(min, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
   int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
   min = vpmin_s8(min, min);
   min = vpmin_s8(min, min);
@@ -2605,117 +3803,169 @@
   return vget_lane_s8(min, 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   uint8x8_t min = vpmin_u8(a_dup, a_dup);
   min = vpmin_u8(min, min);
   return vget_lane_u8(min, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) { return vminv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) { return vminvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) { return vminv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) { return vminvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) { return vminv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) { return vminvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) { return vminv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { return vminvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) { return vminv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) { return vminvq_u32(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+  return vminv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+  return vminvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+  return vminv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+  return vminvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+  return vminv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+  return vminvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+  return vminv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  return vminvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+  return vminv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+  return vminvq_u32(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t min = vpmin_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t min = vpmin_u8(a, a);
   min = vpmin_u8(min, min);
   min = vpmin_u8(min, min);
   return vget_lane_u8(min, 0);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
   uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
   min = vpmin_u8(min, min);
   min = vpmin_u8(min, min);
   min = vpmin_u8(min, min);
   return vget_lane_u8(min, 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t min = vpmin_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+  const int16x4_t min = vpmin_s16(a, a);
   return vget_lane_s16(vpmin_s16(min, min), 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
   int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
   min = vpmin_s16(min, min);
   min = vpmin_s16(min, min);
   return vget_lane_s16(min, 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t min = vpmin_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+  const uint16x4_t min = vpmin_u16(a, a);
   return vget_lane_u16(vpmin_u16(min, min), 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
   uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
   min = vpmin_u16(min, min);
   min = vpmin_u16(min, min);
   return vget_lane_u16(min, 0);
 }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmin_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpmin_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
   const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
   return vget_lane_s32(vpmin_s32(min, min), 0);
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmin_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpmin_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
   const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
   return vget_lane_u32(vpmin_u32(min, min), 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
-{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
-{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
+  return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
+  return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
 
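Note (not part of the patch): the min/max reductions mirror the sum — single-instruction vminv/vmaxv on AArch64, a vpmin/vpmax folding tree otherwise, and the 64-bit packets always fall back to scalar (std::min)/(std::max) because NEON has no 64-bit min/max. A sketch of the 32-bit-ARM folding for Packet4f:

// Sketch only: the vpmin folding behind predux_min<Packet4f> on 32-bit ARM
// (AArch64 collapses this to a single vminvq_f32).
#include <arm_neon.h>
#include <cassert>

int main() {
  const float in[4] = {3.f, -1.f, 7.f, 2.f};
  float32x4_t a = vld1q_f32(in);

  float32x2_t m = vmin_f32(vget_low_f32(a), vget_high_f32(a));  // {min(3,7), min(-1,2)}
  m = vpmin_f32(m, m);                                          // overall min in lane 0
  assert(vget_lane_f32(m, 0) == -1.f);
  return 0;
}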
 // max
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) { return vmaxv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { return vmaxvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+  return vmaxv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  return vmaxvq_f32(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmax_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpmax_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
   const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
   return vget_lane_f32(vpmax_f32(max, max), 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   int8x8_t max = vpmax_s8(a_dup, a_dup);
   max = vpmax_s8(max, max);
   return vget_lane_s8(max, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) { return vmaxv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) { return vmaxvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+  return vmaxv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+  return vmaxvq_s8(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
-{
-  int8x8_t max = vpmax_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+  int8x8_t max = vpmax_s8(a, a);
   max = vpmax_s8(max, max);
   max = vpmax_s8(max, max);
   return vget_lane_s8(max, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
   int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
   max = vpmax_s8(max, max);
   max = vpmax_s8(max, max);
@@ -2723,201 +3973,238 @@
   return vget_lane_s8(max, 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   uint8x8_t max = vpmax_u8(a_dup, a_dup);
   max = vpmax_u8(max, max);
   return vget_lane_u8(max, 0);
 }
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) { return vmaxv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) { return vmaxvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) { return vmaxv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) { return vmaxvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) { return vmaxv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) { return vmaxvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) { return vmaxv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) { return vmaxvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) { return vmaxv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) { return vmaxvq_u32(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+  return vmaxv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+  return vmaxvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+  return vmaxv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+  return vmaxvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+  return vmaxv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+  return vmaxvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+  return vmaxv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  return vmaxvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+  return vmaxv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+  return vmaxvq_u32(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t max = vpmax_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t max = vpmax_u8(a, a);
   max = vpmax_u8(max, max);
   max = vpmax_u8(max, max);
   return vget_lane_u8(max, 0);
 }
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
   uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
   max = vpmax_u8(max, max);
   max = vpmax_u8(max, max);
   max = vpmax_u8(max, max);
   return vget_lane_u8(max, 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t max = vpmax_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+  const int16x4_t max = vpmax_s16(a, a);
   return vget_lane_s16(vpmax_s16(max, max), 0);
 }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
   int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
   max = vpmax_s16(max, max);
   max = vpmax_s16(max, max);
   return vget_lane_s16(max, 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t max = vpmax_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+  const uint16x4_t max = vpmax_u16(a, a);
   return vget_lane_u16(vpmax_u16(max, max), 0);
 }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
   uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
   max = vpmax_u16(max, max);
   max = vpmax_u16(max, max);
   return vget_lane_u16(max, 0);
 }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmax_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpmax_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
   const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
   return vget_lane_s32(vpmax_s32(max, max), 0);
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmax_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpmax_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
   const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
   return vget_lane_u32(vpmax_u32(max, max), 0);
 }
 #endif
-template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
-{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
-{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
+  return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
+  return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
-  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
-                            vget_high_u32(vreinterpretq_u32_f32(x)));
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
+  uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
   return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
 }
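
A scalar rendering of the predux_any test above, assuming four uint32 lanes: OR the low and high halves together, then one pairwise max leaves any set bit in lane 0. The helper below is illustrative, not Eigen API:

#include <cstdint>
#include <cstdio>

bool any_lane(const uint32_t (&v)[4]) {
  uint32_t lo = v[0] | v[2];  // vorr of low/high halves (order is immaterial)
  uint32_t hi = v[1] | v[3];
  return (lo > hi ? lo : hi) != 0;  // vpmax, then read lane 0 as bool
}

int main() {
  uint32_t none[4] = {0, 0, 0, 0};
  uint32_t some[4] = {0, 0, 0xFFFFFFFFu, 0};
  std::printf("%d %d\n", any_lane(none), any_lane(some));  // 0 1
}
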
 
 // Helpers for ptranspose.
 namespace detail {
-  
-template<typename Packet>
+
+template <typename Packet>
 void zip_in_place(Packet& p1, Packet& p2);
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
   const float32x2x2_t tmp = vzip_f32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
   const float32x4x2_t tmp = vzipq_f32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
   const int8x8x2_t tmp = vzip_s8(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
   const int8x16x2_t tmp = vzipq_s8(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
   const uint8x8x2_t tmp = vzip_u8(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
   const uint8x16x2_t tmp = vzipq_u8(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
   const int32x2x2_t tmp = vzip_s32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
   const int32x4x2_t tmp = vzipq_s32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
   const uint32x2x2_t tmp = vzip_u32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
   const uint32x4x2_t tmp = vzipq_u32(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
   const int16x4x2_t tmp = vzip_s16(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
   const int16x8x2_t tmp = vzipq_s16(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
   const uint16x4x2_t tmp = vzip_u16(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<>
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
   const uint16x8x2_t tmp = vzipq_u16(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
   zip_in_place(kernel.packet[0], kernel.packet[1]);
 }
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
   zip_in_place(kernel.packet[0], kernel.packet[2]);
   zip_in_place(kernel.packet[1], kernel.packet[3]);
@@ -2925,7 +4212,7 @@
   zip_in_place(kernel.packet[2], kernel.packet[3]);
 }
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
   zip_in_place(kernel.packet[0], kernel.packet[4]);
   zip_in_place(kernel.packet[1], kernel.packet[5]);
@@ -2936,31 +4223,31 @@
   zip_in_place(kernel.packet[1], kernel.packet[3]);
   zip_in_place(kernel.packet[4], kernel.packet[6]);
   zip_in_place(kernel.packet[5], kernel.packet[7]);
-  
+
   zip_in_place(kernel.packet[0], kernel.packet[1]);
   zip_in_place(kernel.packet[2], kernel.packet[3]);
   zip_in_place(kernel.packet[4], kernel.packet[5]);
   zip_in_place(kernel.packet[6], kernel.packet[7]);
 }
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
   EIGEN_UNROLL_LOOP
-  for (int i=0; i<4; ++i) {
+  for (int i = 0; i < 4; ++i) {
     const int m = (1 << i);
     EIGEN_UNROLL_LOOP
-    for (int j=0; j<m; ++j) {
-      const int n = (1 << (3-i));
+    for (int j = 0; j < m; ++j) {
+      const int n = (1 << (3 - i));
       EIGEN_UNROLL_LOOP
-      for (int k=0; k<n; ++k) {
-        const int idx = 2*j*n+k;
+      for (int k = 0; k < n; ++k) {
+        const int idx = 2 * j * n + k;
         zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
       }
     }
   }
 }
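
The ptranspose_impl overloads above form the classic zip-network transpose: log2(N) rounds of interleaves, starting at stride N/2 and halving each round. A minimal 4x4 scalar sketch of the same two rounds (zip() here mimics vzipq_*; it is not Eigen API):

#include <array>
#include <cstdio>

using Row = std::array<int, 4>;

void zip(Row& p1, Row& p2) {  // scalar analogue of zip_in_place
  Row lo = {p1[0], p2[0], p1[1], p2[1]};
  Row hi = {p1[2], p2[2], p1[3], p2[3]};
  p1 = lo;
  p2 = hi;
}

int main() {
  Row m[4] = {{0, 1, 2, 3}, {10, 11, 12, 13}, {20, 21, 22, 23}, {30, 31, 32, 33}};
  zip(m[0], m[2]);  // round 1: pairs at stride 2
  zip(m[1], m[3]);
  zip(m[0], m[1]);  // round 2: pairs at stride 1
  zip(m[2], m[3]);
  for (const Row& r : m) std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
  // prints the transpose: column j of the input becomes row j
}
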
 
-} // namespace detail
+}  // namespace detail
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
   detail::ptranspose_impl(kernel);
@@ -2969,12 +4256,11 @@
   detail::ptranspose_impl(kernel);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
   const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
   const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
 
-  const int8x8x2_t zip8 = vzip_s8(a,b);
+  const int8x8x2_t zip8 = vzip_s8(a, b);
   const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
 
   kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
@@ -2998,12 +4284,11 @@
   detail::ptranspose_impl(kernel);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
   const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
   const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
 
-  const uint8x8x2_t zip8 = vzip_u8(a,b);
+  const uint8x8x2_t zip8 = vzip_u8(a, b);
   const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
 
   kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
@@ -3051,7 +4336,7 @@
   detail::ptranspose_impl(kernel);
 }
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
-    detail::ptranspose_impl(kernel);
+  detail::ptranspose_impl(kernel);
 }
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
   detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
@@ -3060,158 +4345,195 @@
   detail::ptranspose_impl(kernel);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2l, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
 #if EIGEN_ARCH_ARM64
   const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
 #else
-  const int64x1_t tmp[2][2] = {
-    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
-    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
-  };
+  const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
+                               {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
 
   kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
   kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
 #endif
 }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2ul, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
 #if EIGEN_ARCH_ARM64
   const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
 #else
-  const uint64x1_t tmp[2][2] = {
-    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
-    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
-  };
+  const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
+                                {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
 
   kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
   kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
 #endif
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
-{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
-{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
-{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
-{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
-{ return vbsl_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
-{ return vbslq_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
-{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
-{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
-{ return vbsl_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
-{ return vbslq_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
-{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
-{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
-{ return vbsl_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
-{ return vbslq_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
-{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
-{ return vbslq_u64(mask, a, b); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
+  return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
+  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
+  return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
+  return vbsl_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
+                                                         const Packet16uc& b) {
+  return vbslq_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
+  return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
+  return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
+  return vbsl_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
+  return vbslq_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
+  return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
+  return vbsl_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return vbslq_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+  return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
+  return vbslq_u64(mask, a, b);
+}
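
All of the pselect specializations above lower to NEON's bitwise select: every result bit is taken from a where the corresponding mask bit is set and from b otherwise, with the vreinterpret casts only changing how the lanes are typed. A scalar sketch of the vbsl semantics:

#include <cstdint>
#include <cstdio>

uint32_t bit_select(uint32_t mask, uint32_t a, uint32_t b) {
  return (mask & a) | (~mask & b);  // what vbsl_* computes per bit
}

int main() {
  // An all-ones lane mask (as produced by pcmp_*) picks a wholesale;
  // an all-zeros lane picks b.
  std::printf("%#x\n", bit_select(0xFFFFFFFFu, 0x11111111u, 0x22222222u));  // 0x11111111
  std::printf("%#x\n", bit_select(0x00000000u, 0x11111111u, 0x22222222u));  // 0x22222222
}
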
 
 // Use armv8 rounding intrinsics if available.
 #if EIGEN_ARCH_ARMV8
-template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
-{ return vrndn_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
+  return vrndn_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
-{ return vrndnq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return vrndnq_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{ return vrndm_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
+  return vrndm_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{ return vrndmq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vrndmq_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
-{ return vrndp_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
+  return vrndp_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{ return vrndpq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vrndpq_f32(a);
+}
 
 #else
 
-template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
   const Packet4f abs_a = pabs(a);
   Packet4f r = padd(abs_a, limit);
   // Don't compile-away addition and subtraction.
   EIGEN_OPTIMIZATION_BARRIER(r);
   r = psub(r, limit);
   // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit),
-              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   return r;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1 << 23));
   const Packet2f abs_a = pabs(a);
   Packet2f r = padd(abs_a, limit);
   // Don't compile-away addition and subtraction.
   EIGEN_OPTIMIZATION_BARRIER(r);
   r = psub(r, limit);
   // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit),
-              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   return r;
 }
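
The non-ARMv8 print above relies on the magic-number rounding trick: for |x| < 2^23, computing (|x| + 2^23) - 2^23 leaves no fractional mantissa bits, so the FPU's round-to-nearest-even performs the rounding; pfloor/pceil below then adjust by one where the rounded value overshot. A scalar sketch (volatile is a stand-in for EIGEN_OPTIMIZATION_BARRIER):

#include <cmath>
#include <cstdio>

float rint_via_bias(float x) {
  const float limit = 8388608.0f;           // 2^23
  if (std::fabs(x) >= limit) return x;      // already integral
  volatile float r = std::fabs(x) + limit;  // fraction bits rounded away here
  r = r - limit;                            // recover the rounded magnitude
  return x < 0.0f ? -r : r;
}

int main() {
  std::printf("%g %g %g\n", rint_via_bias(2.5f), rint_via_bias(-1.3f), rint_via_bias(3.5f));
  // prints 2 -1 4 (ties round to even, matching vrndn semantics)
}
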
 
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp  = print<Packet4f>(a);
+  Packet4f tmp = print<Packet4f>(a);
   // If greater, subtract one.
   Packet4f mask = pcmp_lt(a, tmp);
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
   const Packet2f cst_1 = pset1<Packet2f>(1.0f);
-  Packet2f tmp  = print<Packet2f>(a);
+  Packet2f tmp = print<Packet2f>(a);
   // If greater, subtract one.
   Packet2f mask = pcmp_lt(a, tmp);
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp  = print<Packet4f>(a);
+  Packet4f tmp = print<Packet4f>(a);
   // If smaller, add one.
   Packet4f mask = pcmp_lt(tmp, a);
   mask = pand(mask, cst_1);
   return padd(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
   const Packet2f cst_1 = pset1<Packet2f>(1.0f);
-  Packet2f tmp  = print<Packet2f>(a);
+  Packet2f tmp = print<Packet2f>(a);
   // If smaller, add one.
   Packet2f mask = pcmp_lt(tmp, a);
   mask = pand(mask, cst_1);
@@ -3226,12 +4548,12 @@
  *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
  *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
  */
-template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
   uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
   uint8x8_t res = vdup_n_u8(0);
   uint8x8_t add = vdup_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
+  for (int i = 0; i < 4; i++) {
     const uint8x8_t temp = vorr_u8(res, add);
     res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
     add = vshr_n_u8(add, 1);
@@ -3239,11 +4561,11 @@
   return vget_lane_u32(vreinterpret_u32_u8(res), 0);
 }
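
The unsigned-integer psqrt family above is the digit-by-digit method from the cited Microchip note: propose each result bit from the most significant down and keep it only when the squared trial still fits in the input; vbsl applies the keep/discard decision per lane. A scalar sketch for uint8:

#include <cstdint>
#include <cstdio>

uint8_t isqrt8(uint8_t x) {
  uint8_t res = 0;
  for (uint8_t add = 0x8; add != 0; add >>= 1) {  // 4 candidate bits for uint8
    const uint8_t temp = res | add;
    // The cast mimics vmul_u8's wraparound; no wrap can occur since temp <= 15.
    if (x >= static_cast<uint8_t>(temp * temp)) res = temp;  // keep the bit
  }
  return res;  // floor(sqrt(x))
}

int main() {
  std::printf("%d %d %d\n", isqrt8(0), isqrt8(99), isqrt8(255));  // 0 9 15
}
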
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
   uint8x8_t res = vdup_n_u8(0);
   uint8x8_t add = vdup_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
+  for (int i = 0; i < 4; i++) {
     const uint8x8_t temp = vorr_u8(res, add);
     res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
     add = vshr_n_u8(add, 1);
@@ -3251,11 +4573,11 @@
   return res;
 }
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
   uint8x16_t res = vdupq_n_u8(0);
   uint8x16_t add = vdupq_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
+  for (int i = 0; i < 4; i++) {
     const uint8x16_t temp = vorrq_u8(res, add);
     res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
     add = vshrq_n_u8(add, 1);
@@ -3263,11 +4585,11 @@
   return res;
 }
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
   uint16x4_t res = vdup_n_u16(0);
   uint16x4_t add = vdup_n_u16(0x80);
-  for (int i = 0; i < 8; i++)
-  {
+  for (int i = 0; i < 8; i++) {
     const uint16x4_t temp = vorr_u16(res, add);
     res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
     add = vshr_n_u16(add, 1);
@@ -3275,11 +4597,11 @@
   return res;
 }
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
   uint16x8_t res = vdupq_n_u16(0);
   uint16x8_t add = vdupq_n_u16(0x80);
-  for (int i = 0; i < 8; i++)
-  {
+  for (int i = 0; i < 8; i++) {
     const uint16x8_t temp = vorrq_u16(res, add);
     res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
     add = vshrq_n_u16(add, 1);
@@ -3287,11 +4609,11 @@
   return res;
 }
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
   uint32x2_t res = vdup_n_u32(0);
   uint32x2_t add = vdup_n_u32(0x8000);
-  for (int i = 0; i < 16; i++)
-  {
+  for (int i = 0; i < 16; i++) {
     const uint32x2_t temp = vorr_u32(res, add);
     res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
     add = vshr_n_u32(add, 1);
@@ -3299,11 +4621,11 @@
   return res;
 }
 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
   uint32x4_t res = vdupq_n_u32(0);
   uint32x4_t add = vdupq_n_u32(0x8000);
-  for (int i = 0; i < 16; i++)
-  {
+  for (int i = 0; i < 16; i++) {
     const uint32x4_t temp = vorrq_u32(res, add);
     res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
     add = vshrq_n_u32(add, 1);
@@ -3329,7 +4651,8 @@
   return result;
 }
 
-template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
+template <typename Packet>
+Packet prsqrt_float_common(const Packet& a) {
   const Packet cst_zero = pzero(a);
   const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
   Packet return_zero = pcmp_eq(a, cst_inf);
@@ -3340,16 +4663,18 @@
   return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
   return prsqrt_float_common(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
   return prsqrt_float_common(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
   // Compute approximate reciprocal.
   float32x4_t result = vrecpeq_f32(a);
   result = vmulq_f32(vrecpsq_f32(a, result), result);
@@ -3357,8 +4682,8 @@
   return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
   // Compute approximate reciprocal.
   float32x2_t result = vrecpe_f32(a);
   result = vmul_f32(vrecps_f32(a, result), result);
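
The vrecpe/vrecps pairs above are Newton-Raphson steps for the reciprocal: vrecps(a, x) evaluates 2 - a*x, so x * (2 - a*x) roughly doubles the number of correct bits per iteration. A scalar sketch:

#include <cstdio>

float refine_reciprocal(float a, float x) {
  return x * (2.0f - a * x);  // one vrecps_f32 + vmul_f32 step
}

int main() {
  float a = 3.0f;
  float x = 0.3f;               // crude estimate, like vrecpe_f32
  x = refine_reciprocal(a, x);  // ~0.33
  x = refine_reciprocal(a, x);  // ~0.3333
  std::printf("%.7f\n", x);     // close to 1/3
}
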
@@ -3368,37 +4693,51 @@
 
 // Unfortunately vsqrt_f32 is only available for A64.
 #if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return vsqrtq_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  return vsqrt_f32(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
+  return vdivq_f32(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
+  return vdiv_f32(a, b);
+}
 #else
-template<typename Packet>
+template <typename Packet>
 EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
   const Packet cst_zero = pzero(a);
   const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
-  
-  Packet result = pmul(a, prsqrt_float_unsafe(a));  
+
+  Packet result = pmul(a, prsqrt_float_unsafe(a));
   Packet a_is_zero = pcmp_eq(a, cst_zero);
   Packet a_is_inf = pcmp_eq(a, cst_inf);
   Packet return_a = por(a_is_zero, a_is_inf);
-  
+
   result = pselect(return_a, a, result);
   return result;
 }
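
psqrt_float_common above uses the identity sqrt(a) = a * rsqrt(a) to avoid a division on targets without vsqrt; with the estimate path, a = 0 would produce 0 * inf = NaN and a = inf would produce inf * 0 = NaN, so those lanes are returned unchanged via the por/pselect fixup. A scalar sketch (1/sqrt stands in for prsqrt_float_unsafe):

#include <cmath>
#include <cstdio>
#include <limits>

float sqrt_via_rsqrt(float a) {
  const float inf = std::numeric_limits<float>::infinity();
  if (a == 0.0f || a == inf) return a;  // the por/pselect fixup
  return a * (1.0f / std::sqrt(a));     // a * rsqrt(a)
}

int main() {
  std::printf("%g %g %g\n", sqrt_via_rsqrt(9.0f), sqrt_via_rsqrt(0.0f),
              sqrt_via_rsqrt(std::numeric_limits<float>::infinity()));
  // prints 3 0 inf
}
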
 
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
   return psqrt_float_common(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
   return psqrt_float_common(a);
 }
 
-template<typename Packet>
+template <typename Packet>
 EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
   // if b is large, NEON intrinsics will flush preciprocal(b) to zero
   // avoid underflow with the following manipulation:
@@ -3407,18 +4746,20 @@
   const Packet cst_one = pset1<Packet>(1.0f);
   const Packet cst_quarter = pset1<Packet>(0.25f);
   const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
-  
+
   Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
   Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
   Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
   return result;
 }
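
pdiv_float_common above guards against the reciprocal underflowing: for |b| near FLT_MAX, 1/b is subnormal and the NEON estimate flushes it to zero, so b is pre-scaled by f = 0.25 (making 1/(b*f) four times larger, hence normal) and the product is rescaled by the same f, since f * (a * 1/(b*f)) = a/b. A worked scalar version (1/x stands in for preciprocal):

#include <cstdio>
#include <limits>

float safe_div(float a, float b) {
  const float thresh = std::numeric_limits<float>::max() / 4.0f;
  const float f = (b >= thresh || b <= -thresh) ? 0.25f : 1.0f;
  return f * (a * (1.0f / (b * f)));
}

int main() {
  float big = 3.0e38f;  // 1/big ~ 3.3e-39 is subnormal (< FLT_MIN ~ 1.18e-38),
                        // but 1/(big * 0.25f) ~ 1.33e-38 stays normal
  std::printf("%g\n", safe_div(big, big));  // 1, even under flush-to-zero
}
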
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pdiv_float_common(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
   return pdiv_float_common(a, b);
 }
 #endif
@@ -3429,56 +4770,57 @@
 // TODO: Guard if we have native bfloat16 support
 typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
 
-template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4bf> {
+  enum { value = true };
+};
 
-template<> struct packet_traits<bfloat16> : default_packet_traits
-{
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
   typedef Packet4bf type;
   typedef Packet4bf half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
-    HasDiv       = 1,
-    HasFloor     = 1,
-    HasCeil      = 1,
-    HasRint      = 1,
+    HasBlend = 0,
+    HasDiv = 1,
+    HasFloor = 1,
+    HasCeil = 1,
+    HasRint = 1,
 
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 0,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf  = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
     HasNdtri = 0
   };
 };
 
-template<> struct unpacket_traits<Packet4bf>
-{
+template <>
+struct unpacket_traits<Packet4bf> {
   typedef bfloat16 type;
   typedef Packet4bf half;
-  enum
-  {
+  enum {
     size = 4,
     alignment = Aligned16,
     vectorizable = true,
@@ -3487,23 +4829,22 @@
   };
 };
 
-namespace detail {  
-template<>
+namespace detail {
+template <>
 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
   const uint16x4x2_t tmp = vzip_u16(p1, p2);
   p1 = tmp.val[0];
   p2 = tmp.val[1];
 }
-} // namespace detail
+}  // namespace detail
 
-EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
-{
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
   // See the scalar implementation in BFloat16.h for a comprehensible explanation
   // of this fast rounding algorithm
   Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
 
   // lsb = (input >> 16) & 1
-  Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+  Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
 
   // rounding_bias = 0x7fff + lsb
   Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
@@ -3523,215 +4864,216 @@
   return vmovn_u32(input);
 }
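
F32ToBf16 above is the round-to-nearest-even truncation described in BFloat16.h: bias the 32-bit pattern by 0x7fff plus the lowest surviving bit, then drop the low 16 bits. A scalar sketch (NaN handling omitted):

#include <cstdint>
#include <cstdio>
#include <cstring>

uint16_t f32_to_bf16(float f) {
  uint32_t input;
  std::memcpy(&input, &f, sizeof(input));     // bit pattern of f
  uint32_t lsb = (input >> 16) & 1;           // lowest bit that survives truncation
  uint32_t rounding_bias = 0x7fff + lsb;      // ties go toward even
  input += rounding_bias;
  return static_cast<uint16_t>(input >> 16);  // the vshrq/vmovn step
}

int main() {
  std::printf("%#06x %#06x\n", f32_to_bf16(1.0f), f32_to_bf16(3.14159f));
  // 1.0f -> 0x3f80; pi rounds to the nearest representable bfloat16
}
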
 
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
-{
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
   return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
 }
 
-EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
-  return vmovn_u32(vreinterpretq_u32_f32(p));
-}
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
   return Packet4bf(pset1<Packet4us>(from.value));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
   return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
   return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
   return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
   EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
   EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
   return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
   return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
-                                                                            const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
-                                                                        const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
-                                                          const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
-                                                                            const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
-                                                                        const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
-                                                          const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
   return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
   return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
   return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
   return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
   return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
-                                                      const Packet4bf& b)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
   return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
   return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
   return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
   return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
   return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
 }
 
-template<>
-EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
   pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
   return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
   return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
   return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
   return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
   return Packet4bf(preverse<Packet4us>(Packet4us(a)));
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
   detail::ptranspose_impl(kernel);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
   return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
 }
 
@@ -3756,9 +5098,15 @@
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
 // This doesn't work with MSVC though, since the function names are macros.
-template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a) {
+  return (uint64x2_t)a;
+}
 
-template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a) {
+  return (float64x2_t)a;
+}
 #endif
 
 #if EIGEN_COMP_MSVC_STRICT
@@ -3777,85 +5125,73 @@
 EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
 #endif
 
-
 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
 // for fast inversion of matrices of size 4.
-EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
-{
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
   const double* a = reinterpret_cast<const double*>(&m);
   const double* b = reinterpret_cast<const double*>(&n);
   Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
   return res;
 }
 
-EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
-{
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
   return shuffle(a, b, mask);
 }
-EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
-{
-  return shuffle(a, b, 0);
-}
-EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
-{
-  return shuffle(a, b, 3);
-}
-#define vec2d_duplane(a, p) \
-  Packet2d(vdupq_laneq_f64(a, p))
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
+#define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
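
shuffle() above emulates _mm_shuffle_pd lane selection: bit 0 of the mask picks the lane taken from m, bit 1 the lane taken from n, which is why vec2d_unpacklo (mask 0) and vec2d_unpackhi (mask 3) fall out directly. A scalar sketch:

#include <cstdio>

struct V2d { double v[2]; };

V2d shuffle2(const V2d& m, const V2d& n, int mask) {
  return V2d{{m.v[mask & 1], n.v[(mask >> 1) & 1]}};
}

int main() {
  V2d a{{1.0, 2.0}}, b{{3.0, 4.0}};
  V2d lo = shuffle2(a, b, 0);  // unpacklo: {a[0], b[0]} = {1, 3}
  V2d hi = shuffle2(a, b, 3);  // unpackhi: {a[1], b[1]} = {2, 4}
  std::printf("%g %g | %g %g\n", lo.v[0], lo.v[1], hi.v[0], hi.v[1]);
}
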
 
-template<> struct packet_traits<double>  : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
   typedef Packet2d half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
+    HasBlend = 0,
 
-    HasDiv   = 1,
+    HasDiv = 1,
     HasFloor = 1,
     HasCeil = 1,
     HasRint = 1,
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
-    HasExp  = 1,
-    HasLog  = 1,
+    HasExp = 1,
+    HasLog = 1,
     HasATan = 1,
 #endif
-    HasSin  = 0,
-    HasCos  = 0,
+    HasSin = 0,
+    HasCos = 0,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh = 0,
-    HasErf  = 0
+    HasErf = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2d>
-{
+template <>
+struct unpacket_traits<Packet2d> {
   typedef double type;
   typedef Packet2d half;
   typedef Packet2l integer_packet;
-  enum
-  {
+  enum {
     size = 2,
     alignment = Aligned16,
     vectorizable = true,
@@ -3864,149 +5200,239 @@
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vdupq_n_f64(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
-{
-  const double c[] = {0.0,1.0};
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  const double c[] = {0.0, 1.0};
   return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vaddq_f64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vsubq_f64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
-template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
   const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
   return padd(a, pxor(mask, b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  return vnegq_f64(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmulq_f64(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vdivq_f64(a, b);
+}
 
 #ifdef __ARM_FEATURE_FMA
 // See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vfmaq_f64(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vfmaq_f64(c, a, b);
+}
 #else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vmlaq_f64(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vmlaq_f64(c, a, b);
+}
 #endif
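
For context, pmadd(a, b, c) computes a*b + c; the __ARM_FEATURE_FMA path above maps to a true fused multiply-add that rounds once. A scalar sketch of the difference (illustration only):

#include <cmath>

// Scalar model of the two pmadd paths above: the fused version rounds once,
// the multiply-then-add fallback rounds twice and can differ in the last ulp.
double pmadd_fused(double a, double b, double c) { return std::fma(a, b, c); }
double pmadd_unfused(double a, double b, double c) { return a * b + c; }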
 
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vminq_f64(a, b);
+}
 
 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vminnmq_f64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmaxnmq_f64(a, b);
+}
 
 #endif
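
The two flavors selected above differ only in NaN handling: the vminnmq/vmaxnmq variants follow IEEE-754 minNum/maxNum and prefer the numeric operand, while the plain vminq/vmaxq propagate NaN. A scalar sketch of the assumed semantics:

#include <cmath>
#include <limits>

// Scalar model of the two min flavors: PropagateNumbers returns the non-NaN
// operand (std::fmin already does this), PropagateNaN returns NaN whenever
// either input is NaN.
double min_numbers(double a, double b) { return std::fmin(a, b); }
double min_nan(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<double>::quiet_NaN();
  return std::fmin(a, b);
}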
 
-template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmin<Packet2d>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmaxq_f64(a, b);
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmax<Packet2d>(a, b);
+}
 
 // Logical operations are not supported for floating-point types, so we reinterpret-cast to integer types using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vcleq_f64(a, b));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vcltq_f64(a, b));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vceqq_f64(a, b));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return vld1q_dup_f64(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
+}
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
   Packet2d res = pset1<Packet2d>(0.0);
-  res = vld1q_lane_f64(from + 0*stride, res, 0);
-  res = vld1q_lane_f64(from + 1*stride, res, 1);
+  res = vld1q_lane_f64(from + 0 * stride, res, 0);
+  res = vld1q_lane_f64(from + 1 * stride, res, 1);
   return res;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  vst1q_lane_f64(to + stride*0, from, 0);
-  vst1q_lane_f64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  vst1q_lane_f64(to + stride * 0, from, 0);
+  vst1q_lane_f64(to + stride * 1, from, 1);
 }
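
pgather/pscatter read and write packet lanes at a configurable stride, which is what lets vectorized kernels walk a matrix column. A hypothetical usage sketch (the helper name and layout are illustrative, not part of the patch):

// Gather column 0 of a row-major 2x2 matrix: the stride is the row length.
void gather_column_example(const double* m /* 2x2, row-major */, double* out) {
  const Index stride = 2;
  Packet2d col0 = pgather<double, Packet2d>(m, stride);  // {m[0], m[2]}
  pscatter<double, Packet2d>(out, col0, /*stride=*/1);   // contiguous store
}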
 
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
 
 // FIXME only store the first 2 elements?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return vgetq_lane_f64(a, 0);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return vabsq_f64(a);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
   return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
 }
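
The psignbit implementation above relies on one trick: an arithmetic right shift by (bits - 1) smears the sign bit across the whole lane. A scalar sketch (illustration only; the shift of a negative signed value is the arithmetic shift guaranteed since C++20):

#include <cstdint>
#include <cstring>

// Scalar model of psignbit: returns all-ones for inputs with the sign bit set
// (including -0.0 and negative NaNs) and all-zeros otherwise.
std::uint64_t signbit_mask(double x) {
  std::int64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  return static_cast<std::uint64_t>(bits >> 63);  // arithmetic shift
}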
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{ return vaddvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  return vaddvq_f64(a);
+}
 
 // Other reduction functions:
 // mul
 #if EIGEN_COMP_CLANGAPPLE
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return (vget_low_f64(a) * vget_high_f64(a))[0];
+}
 #else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
+}
 #endif
 
 // min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{ return vminvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return vminvq_f64(a);
+}
 
 // max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{ return vmaxvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return vmaxvq_f64(a);
+}
 
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2d, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
   const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
 
@@ -4014,35 +5440,53 @@
   kernel.packet[1] = tmp2;
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
-{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
-{ return vrndnq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return vrndnq_f64(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{ return vrndmq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vrndmq_f64(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
-{ return vrndpq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vrndpq_f64(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
-{ return pldexp_generic(a, exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  return pldexp_generic(a, exponent);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
-{ return pfrexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
-{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return vreinterpretq_f64_u64(vdupq_n_u64(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
   // Do Newton iterations for 1/sqrt(x).
   return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
 }
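
The refinement used above is the standard Newton iteration for f(y) = 1/y^2 - a, namely y <- y * (3 - a*y^2) / 2, which roughly doubles the correct bits per step; three steps lift the coarse vrsqrteq_f64 estimate to double precision. A scalar sketch:

// Scalar model of the rsqrt Newton step: y_{n+1} = y_n * (1.5 - 0.5*a*y_n^2).
double rsqrt_newton(double a, double y, int steps) {
  for (int i = 0; i < steps; ++i) y = y * (1.5 - 0.5 * a * y * y);
  return y;
}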
 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
+  return vsqrtq_f64(_x);
+}
 
-#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+#endif  // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 // Do we have fp16 types and supporting Neon intrinsics?
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
@@ -4119,7 +5563,7 @@
   };
 };
 
-template<>
+template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
   return vadd_f16(vget_low_f16(a), vget_high_f16(a));
 }
@@ -4229,14 +5673,27 @@
 }
 
 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vminnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminnmq_f16(a, b);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return pmin<Packet4hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return pmin<Packet8hf>(a, b);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
@@ -4249,14 +5706,27 @@
 }
 
 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmaxnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxnmq_f16(a, b);
+}
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return pmax<Packet4hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return pmax<Packet8hf>(a, b);
+}
 
 #define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
   template <>                                                                         \
@@ -4292,28 +5762,34 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
-{ return vrndnq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
+  return vrndnq_f16(a);
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
-{ return vrndn_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
+  return vrndn_f16(a);
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
-{ return vrndmq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
+  return vrndmq_f16(a);
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
-{ return vrndm_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
+  return vrndm_f16(a);
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
-{ return vrndpq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
+  return vrndpq_f16(a);
+}
 
 template <>
-EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
-{ return vrndp_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
+  return vrndp_f16(a);
+}
 
 template <>
 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
@@ -4415,13 +5891,17 @@
 EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
   Packet4hf lo, hi;
   lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
-  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
+  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
   return vcombine_f16(lo, hi);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
+  return vsetq_lane_f16(b.x, a, 0);
+}
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
+  return vset_lane_f16(b.x, a, 0);
+}
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
@@ -4433,9 +5913,13 @@
   return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
+  return vsetq_lane_f16(b.x, a, 7);
+}
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
+  return vset_lane_f16(b.x, a, 3);
+}
 
 template <>
 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
@@ -4482,7 +5966,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
+                                                                            Index stride) {
   to[stride * 0].x = vgetq_lane_f16(from, 0);
   to[stride * 1].x = vgetq_lane_f16(from, 1);
   to[stride * 2].x = vgetq_lane_f16(from, 2);
@@ -4494,7 +5979,8 @@
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
+                                                                            Index stride) {
   to[stride * 0].x = vget_lane_f16(from, 0);
   to[stride * 1].x = vget_lane_f16(from, 1);
   to[stride * 2].x = vget_lane_f16(from, 2);
@@ -4524,7 +6010,8 @@
   return h;
 }
 
-template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
   float16x4_t a_lo, a_hi;
   Packet8hf a_r64;
 
@@ -4544,7 +6031,7 @@
   return vabsq_f16(a);
 }
 
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
   return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
 }
@@ -4556,7 +6043,7 @@
 
 template <>
 EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
-  return vreinterpret_f16_s16( vshr_n_s16( vreinterpret_s16_f16(a), 15)); 
+  return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
 }
 
 template <>
@@ -4636,8 +6123,7 @@
   return h;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
   const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
   const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
 
@@ -4690,10 +6176,10 @@
   kernel.packet[6] = T_3[1].val[1];
   kernel.packet[7] = T_3[3].val[1];
 }
-#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif  // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_NEON_H
+#endif  // EIGEN_PACKET_MATH_NEON_H
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 68566b0..58d7b8c 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -18,7 +18,6 @@
 
 namespace internal {
 
-
 //==============================================================================
 // preinterpret (truncation operations)
 //==============================================================================
@@ -93,7 +92,6 @@
   return Packet4f(vreinterpretq_f32_u32(a));
 }
 
-
 template <>
 EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
   return static_cast<Packet4c>(a);
@@ -107,7 +105,6 @@
   return Packet16c(vreinterpretq_s8_u8(a));
 }
 
-
 template <>
 EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
   return static_cast<Packet4uc>(a);
@@ -185,7 +182,6 @@
 // pcast, SrcType = float
 //==============================================================================
 
-
 template <>
 struct type_casting_traits<float, numext::int64_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
@@ -415,7 +411,6 @@
   return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))));
 }
 
-
 template <>
 struct type_casting_traits<numext::int8_t, numext::uint32_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
@@ -477,7 +472,6 @@
   return preinterpret<Packet4us>(pcast<Packet4c, Packet4s>(a));
 }
 
-
 //==============================================================================
 // pcast, SrcType = uint8_t
 //==============================================================================
@@ -577,7 +571,6 @@
   return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))));
 }
 
-
 template <>
 struct type_casting_traits<numext::uint8_t, numext::int16_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
@@ -595,7 +588,6 @@
   return preinterpret<Packet4s>(pcast<Packet4uc, Packet4us>(a));
 }
 
-
 //==============================================================================
 // pcast, SrcType = int16_t
 //==============================================================================
@@ -673,7 +665,6 @@
   return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
 }
 
-
 template <>
 struct type_casting_traits<numext::int16_t, numext::int8_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -794,7 +785,6 @@
   return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
 }
 
-
 template <>
 struct type_casting_traits<numext::uint16_t, numext::uint8_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -881,7 +871,6 @@
   return preinterpret<Packet2ul>(pcast<Packet2i, Packet2l>(a));
 }
 
-
 template <>
 struct type_casting_traits<numext::int32_t, numext::int16_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1013,7 +1002,6 @@
   return preinterpret<Packet2l>(pcast<Packet2ui, Packet2ul>(a));
 }
 
-
 template <>
 struct type_casting_traits<numext::uint32_t, numext::uint16_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1273,7 +1261,6 @@
 #endif
 }
 
-
 template <>
 struct type_casting_traits<numext::uint64_t, numext::uint32_t> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1407,7 +1394,6 @@
   return Packet4i(vreinterpretq_s32_f64(a));
 }
 
-
 template <>
 struct type_casting_traits<double, float> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1534,7 +1520,7 @@
 }
 template <>
 EIGEN_STRONG_INLINE Packet8uc pcast<Packet2d, Packet8uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
-                                                           const Packet2d& d) {
+                                                         const Packet2d& d) {
   return preinterpret<Packet8uc>(pcast<Packet2d, Packet8c>(a, b, c, d));
 }
 template <>
diff --git a/Eigen/src/Core/arch/NEON/UnaryFunctors.h b/Eigen/src/Core/arch/NEON/UnaryFunctors.h
index 09da91c..8be5bb0 100644
--- a/Eigen/src/Core/arch/NEON/UnaryFunctors.h
+++ b/Eigen/src/Core/arch/NEON/UnaryFunctors.h
@@ -17,38 +17,31 @@
 
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 /** \internal
-  * \brief Template specialization of the logistic function for Eigen::half.
-  */
+ * \brief Template specialization of the logistic function for Eigen::half.
+ */
 template <>
 struct scalar_logistic_op<Eigen::half> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Eigen::half operator()(const Eigen::half& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator()(const Eigen::half& x) const {
     // Convert to float and call scalar_logistic_op<float>.
     const scalar_logistic_op<float> float_op;
     return Eigen::half(float_op(float(x)));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Eigen::half packetOp(const Eigen::half& x) const {
-    return this->operator()(x);
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half packetOp(const Eigen::half& x) const { return this->operator()(x); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet4hf packetOp(const Packet4hf& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf packetOp(const Packet4hf& x) const {
     const scalar_logistic_op<float> float_op;
     return vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(x)));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet8hf packetOp(const Packet8hf& x) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf packetOp(const Packet8hf& x) const {
     const scalar_logistic_op<float> float_op;
-    return vcombine_f16(
-      vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
-      vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
+    return vcombine_f16(vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
+                        vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
   }
 };
 
-template<>
+template <>
 struct functor_traits<scalar_logistic_op<Eigen::half>> {
   enum {
     Cost = functor_traits<scalar_logistic_op<float>>::Cost,
@@ -57,8 +50,8 @@
 };
 #endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_NEON_UNARY_FUNCTORS_H
+#endif  // EIGEN_NEON_UNARY_FUNCTORS_H
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d068806..4c5c499 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -18,8 +18,7 @@
 namespace internal {
 
 //---------- float ----------
-struct Packet2cf
-{
+struct Packet2cf {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
   Packet4f v;
@@ -28,8 +27,8 @@
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
 // to leverage AVX instructions.
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
   typedef Packet2cf half;
   enum {
@@ -37,138 +36,179 @@
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0,
-    HasBlend  = 1
+    HasBlend = 1
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet2cf> {
+template <>
+struct unpacket_traits<Packet2cf> {
   typedef std::complex<float> type;
   typedef Packet2cf half;
   typedef Packet4f as_real;
   enum {
-    size=2,
-    alignment=Aligned16,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
-{
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
-  return Packet2cf(_mm_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_add_ps(a.v, b.v));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet2cf(_mm_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_sub_ps(a.v, b.v));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  #ifdef EIGEN_VECTORIZE_SSE3
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v, mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
-                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
-                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-  #else
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
-  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
-  #endif
+                                 _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  //   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+  //                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+  //                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+#else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000));
+  return Packet2cf(
+      _mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                 _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+#endif
 }
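
The SSE3 path above is the classic interleaved complex multiply: moveldup duplicates the real parts of a, movehdup its imaginary parts, the (1, 0, 3, 2) swizzle swaps b's real/imag lanes, and addsub subtracts in even lanes and adds in odd ones. A scalar sketch of the identity it computes:

#include <complex>

// For each (real, imag) pair: tmp1 = {ar*br, ar*bi}, tmp2 = {ai*bi, ai*br},
// addsub(tmp1, tmp2) = {ar*br - ai*bi, ar*bi + ai*br}.
std::complex<float> cmul_model(std::complex<float> a, std::complex<float> b) {
  const float re = a.real() * b.real() - a.imag() * b.imag();
  const float im = a.real() * b.imag() + a.imag() * b.real();
  return {re, im};
}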
 
-template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(ptrue(Packet4f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_andnot_ps(b.v, a.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   const float re = std::real(from);
   const float im = std::imag(from);
   return Packet2cf(_mm_set_ps(im, re, im, re));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
-
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
-  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
-                              std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
-  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
-  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
-                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet2cf(_mm_set_ps(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+                              std::real(from[0 * stride])));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                       _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride * 1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                       _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
   alignas(alignof(__m64)) std::complex<float> res;
   _mm_storel_pi((__m64*)&res, a.v);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
-  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v)))));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
-  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v, a.v))));
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v, a.v))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
   return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   return pdiv_complex(a, b);
 }
 
 //---------- double ----------
-struct Packet1cd
-{
+struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
   Packet2d v;
@@ -177,8 +217,8 @@
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
 // to leverage AVX instructions.
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet1cd type;
   typedef Packet1cd half;
   enum {
@@ -186,112 +226,155 @@
     AlignedOnScalar = 0,
     size = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasSqrt   = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet1cd> {
+template <>
+struct unpacket_traits<Packet1cd> {
   typedef std::complex<double> type;
   typedef Packet1cd half;
   typedef Packet2d as_real;
   enum {
-    size=1,
-    alignment=Aligned16,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
   };
 };
 
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-  return Packet1cd(_mm_xor_pd(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000, 0x0, 0x0, 0x0));
+  return Packet1cd(_mm_xor_pd(a.v, mask));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  #ifdef EIGEN_VECTORIZE_SSE3
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
   return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
-                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                            vec2d_swizzle1(b.v, 1, 0))));
-  #else
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0))));
+#else
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
   return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
-  #endif
+                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask)));
+#endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
+  return Packet1cd(ptrue(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_andnot_pd(b.v, a.v));
+}
 
 // FIXME force unaligned load, this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
 
 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 double res[2];
   _mm_store_pd(res, a.v);
-  return std::complex<double>(res[0],res[1]);
+  return std::complex<double>(res[0], res[1]);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
   return pfirst(a);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
   return pfirst(a);
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   return pdiv_complex(a, b);
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
   return Packet1cd(preverse(Packet2d(x.v)));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
   __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
 
@@ -300,32 +383,36 @@
   kernel.packet[1].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
   __m128 eq = _mm_cmpeq_ps(a.v, b.v);
   return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
 }
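
pcmp_eq above must report equality per complex number, not per float lane: AND-ing the lane-wise comparison with its (1, 0, 3, 2) swizzle leaves a pair all-ones only when both the real and imaginary comparisons succeeded. A scalar sketch of the per-slot predicate:

// Scalar model of the swizzle+AND trick: one complex slot compares equal only
// if both of its component comparisons do.
bool complex_eq_model(float ar, float ai, float br, float bi) {
  return (ar == br) && (ai == bi);
}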
 
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
   __m128d eq = _mm_cmpeq_pd(a.v, b.v);
   return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
 }
 
-template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
   __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
   return Packet2cf(_mm_castpd_ps(result));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
   return psqrt_complex<Packet1cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
   return psqrt_complex<Packet2cf>(a);
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_SSE_H
+#endif  // EIGEN_COMPLEX_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 0f86bcf..30c1f07 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -29,17 +29,23 @@
 // iteration for square root. In particular, Skylake and Zen2 processors
 // have approximately doubled throughput of the _mm_sqrt_ps instruction
 // compared to their predecessors.
-template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  return _mm_sqrt_ps(x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+  return _mm_sqrt_pd(x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt<Packet16b>(const Packet16b& x) {
+  return x;
+}
 
 #if EIGEN_FAST_MATH
 // Even on Skylake, using Newton iteration is a win for reciprocal square root.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
 }
 
@@ -47,28 +53,25 @@
 // Trying to speed up reciprocal using Newton-Raphson is counterproductive
 // unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1)), a) is
 // 30% faster.
-template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
   return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
 }
 #endif
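
The refinement here is Newton's method for f(x) = 1/x - a, namely x <- x * (2 - a*x). One step roughly doubles the ~12 accurate bits of the _mm_rcp_ps estimate, which is enough for a 24-bit float mantissa. A scalar sketch:

// Scalar model of the reciprocal Newton step used above.
float reciprocal_newton(float a, float x) { return x * (2.0f - a * x); }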
 
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
 namespace numext {
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float sqrt(const float &x)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float& x) {
   return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
 }
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double sqrt(const double &x)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double& x) {
 #if EIGEN_COMP_GNUC_STRICT
   // This works around a GCC bug generating poor code for _mm_sqrt_pd
   // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
@@ -78,8 +81,8 @@
 #endif
 }
 
-} // end namespace numex
+}  // namespace numext
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
+#endif  // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 8dd553d..be8183c 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -25,7 +25,7 @@
 #if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
 // 32 bits =>  8 registers
 // 64 bits => 16 registers
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
 #endif
 
 #ifdef EIGEN_VECTORIZE_FMA
@@ -34,16 +34,18 @@
 #endif
 #endif
 
-#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
+     (__GXX_ABI_VERSION < 1004)) ||                                                                     \
+    EIGEN_OS_QNX
 // With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
 // have overloads for both types without a linking error.
 // One solution is to increase the ABI version using -fabi-version=4 (or greater).
 // Otherwise, we work around this inconvenience by wrapping 128-bit types in the following helper
 // structure:
-typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128> Packet4f;
 typedef eigen_packet_wrapper<__m128d> Packet2d;
 #else
-typedef __m128  Packet4f;
+typedef __m128 Packet4f;
 typedef __m128d Packet2d;
 #endif
 
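For readers unfamiliar with eigen_packet_wrapper: it is a thin struct that gives
__m128 and __m256 distinct C++ types so overload resolution can tell them apart.
A simplified sketch of the idea (hypothetical, not Eigen's actual definition):

template <typename T, int Unique = 0>
struct packet_wrapper_sketch {
  packet_wrapper_sketch() {}
  packet_wrapper_sketch(const T& v) : m_val(v) {}
  operator T&() { return m_val; }  // implicit conversions keep the intrinsics usable
  operator const T&() const { return m_val; }
  T m_val;
};
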
@@ -51,87 +53,90 @@
 typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
 typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
 
-template<> struct is_arithmetic<__m128>  { enum { value = true }; };
-template<> struct is_arithmetic<__m128i> { enum { value = true }; };
-template<> struct is_arithmetic<__m128d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet4i>  { enum { value = true }; };
+template <>
+struct is_arithmetic<__m128> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet4i> {
+  enum { value = true };
+};
 // Note that `Packet4ui` uses the underlying type `__m128i`, which is
 // interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
 // operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet4ui> { enum { value = false }; };
-template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4ui> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet16b> {
+  enum { value = true };
+};
 
-template<int p, int q, int r, int s>
-struct shuffle_mask{
- enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+template <int p, int q, int r, int s>
+struct shuffle_mask {
+  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
 };
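
A worked example of the mask packing: each of p, q, r, s is a 2-bit lane index packed
little-endian into an 8-bit immediate, matching _MM_SHUFFLE(s, r, q, p). For instance:

// shuffle_mask<1, 0, 3, 2>::mask = (2 << 6) | (3 << 4) | (0 << 2) | 1
//                                = 0x80 | 0x30 | 0x00 | 0x01 = 0xB1,
// the same immediate as _MM_SHUFFLE(2, 3, 0, 1), which swaps adjacent lane pairs.
static_assert(shuffle_mask<1, 0, 3, 2>::mask == 0xB1, "lane indices pack little-endian");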
 
 // TODO: change the implementation of all swizzle* ops from macros to templates.
-#define vec4f_swizzle1(v,p,q,r,s) \
-  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
+#define vec4f_swizzle1(v, p, q, r, s) \
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
 
-#define vec4i_swizzle1(v,p,q,r,s) \
-  Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
+#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))
 
-#define vec4ui_swizzle1(v, p, q, r, s) \
-  Packet4ui(vec4i_swizzle1(v,p,q,r,s))
+#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))
 
-#define vec2d_swizzle1(v,p,q) \
-  Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
+#define vec2d_swizzle1(v, p, q) \
+  Packet2d(_mm_castsi128_pd(    \
+      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))
 
-#define vec4f_swizzle2(a,b,p,q,r,s) \
-  Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
+#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))
 
-#define vec4i_swizzle2(a,b,p,q,r,s) \
-  Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+#define vec4i_swizzle2(a, b, p, q, r, s) \
+  Packet4i(                              \
+      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
 
-#define vec4ui_swizzle2(a,b,p,q,r,s) \
-  Packet4i(vec4i_swizzle2(a,b,p,q,r,s))
+#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4i(vec4i_swizzle2(a, b, p, q, r, s))
 
-EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
-{
-  return Packet4f(_mm_movelh_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_movelh_ps(a, b));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
-{
-  return Packet4f(_mm_movehl_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_movehl_ps(a, b));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
-{
-  return Packet4f(_mm_unpacklo_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_unpacklo_ps(a, b));
 }
-EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
-{
-  return Packet4f(_mm_unpackhi_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_unpackhi_ps(a, b));
 }
-#define vec4f_duplane(a,p) \
-  vec4f_swizzle2(a,a,p,p,p,p)
+#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
 
-#define vec2d_swizzle2(a,b,mask) \
-  Packet2d(_mm_shuffle_pd(a,b,mask))
+#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
 
-EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
-{
-  return Packet2d(_mm_unpacklo_pd(a,b));
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
+  return Packet2d(_mm_unpacklo_pd(a, b));
 }
-EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
-{
-  return Packet2d(_mm_unpackhi_pd(a,b));
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
+  return Packet2d(_mm_unpackhi_pd(a, b));
 }
-#define vec2d_duplane(a,p) \
-  vec2d_swizzle2(a,a,(p<<1)|p)
+#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
 
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
-  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
 #define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
 
@@ -147,7 +152,7 @@
     AlignedOnScalar = 1,
     size = 4,
 
-    HasCmp  = 1,
+    HasCmp = 1,
     HasDiv = 1,
     HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
@@ -173,7 +178,7 @@
     HasRound = 1,
 #endif
     HasRint = 1,
-    HasSign = 0   // The manually vectorized version is slightly slower for SSE.
+    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
 };
 template <>
@@ -183,12 +188,12 @@
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=2,
+    size = 2,
 
-    HasCmp  = 1,
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasATan = 1,
@@ -201,23 +206,23 @@
     HasRint = 1
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     HasCmp = 1,
-    HasDiv=1,
-    size=4,
+    HasDiv = 1,
+    size = 4,
 
     HasShift = 1,
     HasBlend = 1
   };
 };
-template<> struct packet_traits<uint32_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
   typedef Packet4ui type;
   typedef Packet4ui half;
   enum {
@@ -236,81 +241,167 @@
   };
 };
 #endif
-template<> struct packet_traits<bool> : default_packet_traits
-{
+template <>
+struct packet_traits<bool> : default_packet_traits {
   typedef Packet16b type;
   typedef Packet16b half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=16,
+    size = 16,
 
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasCmp       = 1, // note -- only pcmp_eq is defined
-    HasShift     = 0,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 0,
-    HasAbs2      = 0,
-    HasMin       = 0,
-    HasMax       = 0,
-    HasConj      = 0,
-    HasSqrt      = 1,
-    HasSign      = 0   // Don't try to vectorize psign<bool> = identity.
+    HasAdd = 1,
+    HasSub = 1,
+    HasCmp = 1,  // note -- only pcmp_eq is defined
+    HasShift = 0,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSqrt = 1,
+    HasSign = 0  // Don't try to vectorize psign<bool> = identity.
   };
 };
 
-template<> struct unpacket_traits<Packet4f> {
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet4i  integer_packet;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet2d> {
-  typedef double    type;
-  typedef Packet2d  half;
-  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2d half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet4i> {
-  typedef int       type;
-  typedef Packet4i  half;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet4ui> {
+template <>
+struct unpacket_traits<Packet4ui> {
   typedef uint32_t type;
   typedef Packet4ui half;
-  enum {size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false};
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
-template<> struct unpacket_traits<Packet16b> {
-  typedef bool       type;
-  typedef Packet16b  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet16b> {
+  typedef bool type;
+  typedef Packet16b half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
 };
 
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
-template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+template <>
+struct scalar_div_cost<float, true> {
+  enum { value = 7 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+  enum { value = 8 };
+};
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return _mm_set1_epi32(numext::bit_cast<int32_t>(from)); }
-template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool&    from) { return _mm_set1_epi8(static_cast<char>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return _mm_set_ps1(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return _mm_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+  return _mm_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
+  return _mm_set1_epi8(static_cast<char>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return _mm_castsi128_ps(pset1<Packet4i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return _mm_castsi128_pd(_mm_set1_epi64x(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
-template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
+  return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
+  return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
+  return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
+  return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
-template<> EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) { return _mm_setzero_si128(); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
+  return _mm_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
+  return _mm_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
+  return _mm_setzero_si128();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
+  return _mm_setzero_si128();
+}
 
 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
 // However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203).
@@ -318,242 +409,455 @@
 // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
 // Also note that with AVX, we want it to generate a vbroadcastss.
 #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
-template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
-  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) { return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_add_epi32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_add_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_add_epi32(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_or_si128(a, b);
+}
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
-template<> EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a,b); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_add_ss(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_add_sd(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_sub_epi32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_xor_si128(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
-template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
 #ifdef EIGEN_VECTORIZE_SSE3
-  return _mm_addsub_ps(a,b);
+  return _mm_addsub_ps(a, b);
 #else
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
   return padd(a, pxor(mask, b));
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
-template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
 #ifdef EIGEN_VECTORIZE_SSE3
-  return _mm_addsub_pd(a,b); 
+  return _mm_addsub_pd(a, b);
 #else
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
   return padd(a, pxor(mask, b));
 #endif
 }
 
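A note on the pre-SSE3 fallback above: XOR-ing b with a mask that sets the sign bit in
lanes 0 and 2 negates those lanes, so a plain padd yields the alternating pattern:

// paddsub(a, b) = (a0 - b0, a1 + b1, a2 - b2, a3 + b3)
//               = padd(a, (-b0, b1, -b2, b3))
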
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
-  return _mm_xor_ps(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+  return _mm_xor_ps(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
-  return _mm_xor_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
+  return _mm_xor_pd(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
-  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return psub(Packet4i(_mm_setr_epi32(0, 0, 0, 0)), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) {
   return psub(pset1<Packet16b>(false), a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_mullo_epi32(a,b);
+  return _mm_mullo_epi32(a, b);
 #else
   // this version is slightly faster than 4 scalar products
   return vec4i_swizzle1(
-            vec4i_swizzle2(
-              _mm_mul_epu32(a,b),
-              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
-                            vec4i_swizzle1(b,1,0,3,2)),
-              0,2,0,2),
-            0,2,1,3);
+      vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
+                     0, 2, 0, 2),
+      0, 2, 1, 3);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_mullo_epi32(a,b);
+  return _mm_mullo_epi32(a, b);
 #else
   // this version is slightly faster than 4 scalar products
   return vec4ui_swizzle1(
-            vec4ui_swizzle2(
-              _mm_mul_epu32(a,b),
-              _mm_mul_epu32(vec4ui_swizzle1(a,1,0,3,2),
-                            vec4ui_swizzle1(b,1,0,3,2)),
-              0,2,0,2),
-            0,2,1,3);
+      vec4ui_swizzle2(_mm_mul_epu32(a, b),
+                      _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
+      0, 2, 1, 3);
 #endif
 }
 
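Context for the SSE2 fallbacks above: _mm_mul_epu32 multiplies only lanes 0 and 2,
producing two 64-bit products. Swizzling both operands by (1, 0, 3, 2) exposes lanes
1 and 3 to a second _mm_mul_epu32; the final swizzles gather the low 32 bits of all
four products back into order. Per lane, the result is the usual wrap-around product:

// result[i] == (int32_t)((uint32_t)a[i] * (uint32_t)b[i])
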
-template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_and_si128(a, b);
+}
 
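With booleans stored as 0x00/0x01 bytes, the Packet16b ring operations reduce to
bitwise ones, as used here and in padd/psub above:

// padd(a, b) = a | b   (saturating add on {0, 1})
// psub(a, b) = a ^ b   (subtraction mod 2)
// pmul(a, b) = a & b   (product of {0, 1} values)
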
 template <>
-EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a,
-                                            const Packet4i& b) {
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_div_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
 #ifdef EIGEN_VECTORIZE_AVX
-  return _mm256_cvttpd_epi32(
-      _mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
+  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
 #else
   __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
-  __m128i q_hi =
-      _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)),
-                                 _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
+  __m128i q_hi = _mm_cvttpd_epi32(
+      _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
   return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
 #endif
 }
 
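Context for pdiv<Packet4i>: every int32 is exactly representable as a double, so
converting, dividing, and truncating with cvttpd reproduces the C-style quotient
exactly (assuming b[i] != 0 and no INT_MIN / -1 overflow). _mm_cvtepi32_pd converts
only the low two lanes, hence the swizzle for lanes 2, 3 and the final interleave:

// result[i] == (int32_t)((double)a[i] / (double)b[i])
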
-
 // For some weird reason, it has to be overloaded for packets of integers.
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { return padd(pmul(a, b), c); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return padd(pmul(a, b), c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+  return padd(pmul(a, b), c);
+}
 #ifdef EIGEN_VECTORIZE_FMA
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmsub_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmsub_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmadd_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmadd_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmsub_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fnmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fnmadd_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fnmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fnmsub_pd(a, b, c);
+}
 
-template<typename Packet> EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
-template<> EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a,b,c); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmadd_ss(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmadd_sd(a, b, c);
+}
 #endif
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
-  return _mm_blendv_ps(b,a,mask);
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b, a, mask);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
-  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
-  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+template <>
+EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return _mm_blendv_pd(b, a, mask);
+}
 
-template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
-  return _mm_blendv_epi8(b,a,mask);
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+  return _mm_blendv_epi8(b, a, mask);
 }
 #else
-template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
   Packet16b a_part = _mm_and_si128(mask, a);
   Packet16b b_part = _mm_andnot_si128(mask, b);
   return _mm_or_si128(a_part, b_part);
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
-template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
-template<> EIGEN_STRONG_INLINE Packet4f
-ptrue<Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
+  return _mm_cmpeq_epi32(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) {
+  return _mm_cmpeq_epi8(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
   Packet4i b = _mm_castps_si128(a);
   return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
 }
-template<> EIGEN_STRONG_INLINE Packet2d
-ptrue<Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
   Packet4i b = _mm_castpd_si128(a);
   return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_and_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_and_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_and_si128(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_and_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_or_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_or_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_or_si128(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_or_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_xor_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_xor_si128(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_xor_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_andnot_si128(b, a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_andnot_si128(b, a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmple_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmplt_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmpnge_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmpeq_ps(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmple_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmplt_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmpnge_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmpeq_pd(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+  return _mm_cmplt_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+  return _mm_cmpeq_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_cmpeq_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
+  return _mm_cmpeq_epi8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+  return por(pcmp_lt(a, b), pcmp_eq(a, b));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { return _mm_cmpeq_epi32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
 // There appears to be a bug in GCC, by which the optimizer may
 // flip the argument order in calls to _mm_min_ps, so we have to
 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
 #ifdef EIGEN_VECTORIZE_AVX
   Packet4f res;
-  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
 #else
   Packet4f res = b;
-  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
 #endif
   return res;
 #else
@@ -561,18 +865,19 @@
   return _mm_min_ps(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
 // There appears to be a bug in GCC, by which the optimizer may
 // flip the argument order in calls to _mm_min_pd, so we have to
 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
 #ifdef EIGEN_VECTORIZE_AVX
   Packet2d res;
-  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
 #else
   Packet2d res = b;
-  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
 #endif
   return res;
 #else
@@ -580,17 +885,18 @@
   return _mm_min_pd(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_min_epi32(a,b);
+  return _mm_min_epi32(a, b);
 #else
   // After some benchmarking, this version *is* faster than a scalar implementation.
-  Packet4i mask = _mm_cmplt_epi32(a,b);
-  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+  Packet4i mask = _mm_cmplt_epi32(a, b);
+  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   return _mm_min_epu32(a, b);
 #else
@@ -600,19 +906,19 @@
 #endif
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
 // There appears to be a bug in GCC, by which the optimizer may
 // flip the argument order in calls to _mm_max_ps, so we have to
 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
 #ifdef EIGEN_VECTORIZE_AVX
   Packet4f res;
-  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
 #else
   Packet4f res = b;
-  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
 #endif
   return res;
 #else
@@ -620,18 +926,19 @@
   return _mm_max_ps(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
 // There appears to be a bug in GCC, by which the optimizer may
 // flip the argument order in calls to _mm_max_pd, so we have to
 // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
 // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
 #ifdef EIGEN_VECTORIZE_AVX
   Packet2d res;
-  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
 #else
   Packet2d res = b;
-  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
 #endif
   return res;
 #else
@@ -639,17 +946,18 @@
   return _mm_max_pd(b, a);
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_max_epi32(a,b);
+  return _mm_max_epi32(a, b);
 #else
   // After some benchmarking, this version *is* faster than a scalar implementation.
-  Packet4i mask = _mm_cmpgt_epi32(a,b);
-  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+  Packet4i mask = _mm_cmpgt_epi32(a, b);
+  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   return _mm_max_epu32(a, b);
 #else
@@ -659,7 +967,8 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
 #else
@@ -667,7 +976,8 @@
                             (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
 #endif
 }
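
A note on the pre-SSE4.1 branch above: subtracting 0x80000000 (equivalently, flipping
the top bit) maps unsigned order onto signed order, so the signed compare can be reused:

// a <u b  <=>  (a - 0x80000000) <s (b - 0x80000000)
// e.g. a = 0xFFFFFFFF, b = 1: biased a = 0x7FFFFFFF (INT_MAX),
// biased b = 0x80000001 (negative), so the signed compare correctly yields false.
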
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   return pcmp_eq(a, pmin(a, b));
 #else
@@ -695,167 +1005,212 @@
 }
 
 // Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
   return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
   return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet4f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
   return pminmax_propagate_nan(a, b, pmin<Packet2d>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet4f>);
 }
-template<>
+template <>
 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
   return pminmax_propagate_nan(a, b, pmax<Packet2d>);
 }
 
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right   (const Packet4i& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left    (const Packet4i& a) { return _mm_slli_epi32(a,N); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right   (const Packet4ui& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left    (const Packet4ui& a) { return _mm_slli_epi32(a,N); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
-  return _mm_and_ps(a,mask);
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return _mm_srai_epi32(a, N);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
-  return _mm_and_pd(a,mask);
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return _mm_srli_epi32(a, N);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return _mm_slli_epi32(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
+  return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+  return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+  return _mm_slli_epi32(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+  return _mm_and_ps(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+  return _mm_and_pd(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
 #ifdef EIGEN_VECTORIZE_SSSE3
   return _mm_abs_epi32(a);
 #else
-  Packet4i aux = _mm_srai_epi32(a,31);
-  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+  Packet4i aux = _mm_srai_epi32(a, 31);
+  return _mm_sub_epi32(_mm_xor_si128(a, aux), aux);
 #endif
 }
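
The SSE2 fallback above is the classic branchless two's-complement abs:
_mm_srai_epi32(a, 31) broadcasts each lane's sign bit, so aux is -1 for negative
lanes and 0 otherwise, and (a ^ aux) - aux selects between -a and a:

// negative lane:     (a ^ -1) - (-1) = ~a + 1 = -a
// non-negative lane: (a ^  0) -   0  = a
// e.g. a = -5 (0xFFFFFFFB): aux = 0xFFFFFFFF, (a ^ aux) = 4, 4 - (-1) = 5
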
-template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); }
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
   Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
 #ifdef EIGEN_VECTORIZE_AVX
   return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
 #else
   return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
-#endif // EIGEN_VECTORIZE_AVX
+#endif  // EIGEN_VECTORIZE_AVX
 }
-template<> EIGEN_STRONG_INLINE Packet4ui  psignbit(const Packet4ui& a)  { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
+  return pzero(a);
+}
 
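psignbit returns all-ones in lanes whose sign bit is set and zero elsewhere; the
arithmetic shift by 31 replicates bit 31 across the lane. The double version reuses
the float path and duplicates the odd (high-word) lanes via the <1, 1, 3, 3> shuffle,
since each double's sign bit lives in its upper 32 bits; unsigned lanes never have
the sign bit set, hence pzero.

// psignbit(-0.0f) -> 0xFFFFFFFF,  psignbit(3.5f) -> 0x00000000
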
 #ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
   // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
   const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
   const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
   return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
   const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
   const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
   return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
 }
 
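A note on the rounding trick above: prev0dot5 (0x3EFFFFFF ~= 0.49999997f) is the
largest float strictly below 0.5; OR-ing in a's sign bit yields copysign(prev0dot5, a),
and adding that before truncating toward zero implements round-half-away-from-zero.
Using 0.5f itself would already round up during the addition for inputs just below
one half:

// round(0.49999997f): 0.49999997f + 0.5f rounds up to 1.0f, trunc -> 1  (wrong)
//                     0.49999997f + 0.49999997f = 0.99999994f, trunc -> 0  (right)
// round(2.5f):        2.5f + 0.49999997f rounds to 3.0f, trunc -> 3  (half away from zero)
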
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return _mm_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return _mm_ceil_pd(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return _mm_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return _mm_floor_pd(a);
+}
 #else
-template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
   const Packet4f abs_a = pabs(a);
   Packet4f r = padd(abs_a, limit);
   // Prevent the compiler from optimizing away the addition and subtraction.
   EIGEN_OPTIMIZATION_BARRIER(r);
   r = psub(r, limit);
   // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit),
-              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   return r;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
   // Adds and subtracts signum(a) * 2^52 to force rounding.
-  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull << 52));
   const Packet2d abs_a = pabs(a);
   Packet2d r = padd(abs_a, limit);
   // Prevent the compiler from optimizing away the addition and subtraction.
   EIGEN_OPTIMIZATION_BARRIER(r);
   r = psub(r, limit);
   // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit),
-              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   return r;
 }
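// A hedged scalar sketch of the fallback above: adding and subtracting 2^23
// forces the FPU to drop the fraction bits under the current rounding mode;
// magnitudes of 2^23 or more are already integral. The volatile qualifier
// stands in for EIGEN_OPTIMIZATION_BARRIER, and names are invented.
#include <cmath>
inline float scalar_rint_sketch(float a) {
  const float limit = 8388608.0f;  // 2^23
  float abs_a = std::fabs(a);
  if (!(abs_a < limit)) return a;    // also passes NaN/Inf through
  volatile float r = abs_a + limit;  // keep the add/sub from folding away
  r = r - limit;                     // fraction bits are now rounded off
  return a < 0.0f ? -r : r;          // reapply the sign
}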
 
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp  = print<Packet4f>(a);
+  Packet4f tmp = print<Packet4f>(a);
   // If greater, subtract one.
   Packet4f mask = _mm_cmpgt_ps(tmp, a);
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp  = print<Packet2d>(a);
+  Packet2d tmp = print<Packet2d>(a);
   // If greater, subtract one.
   Packet2d mask = _mm_cmpgt_pd(tmp, a);
   mask = pand(mask, cst_1);
   return psub(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4f tmp  = print<Packet4f>(a);
+  Packet4f tmp = print<Packet4f>(a);
   // If smaller, add one.
   Packet4f mask = _mm_cmplt_ps(tmp, a);
   mask = pand(mask, cst_1);
   return padd(tmp, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
   const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet2d tmp  = print<Packet2d>(a);
+  Packet2d tmp = print<Packet2d>(a);
   // If smaller, add one.
   Packet2d mask = _mm_cmplt_pd(tmp, a);
   mask = pand(mask, cst_1);
@@ -863,71 +1218,104 @@
 }
 #endif
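// A hedged scalar sketch of the fallback pfloor/pceil above: start from
// round-to-nearest (std::nearbyint stands in for print) and correct by one
// where rounding overshot or undershot. Names are invented for illustration.
#include <cmath>
inline float scalar_floor_sketch(float a) {
  float tmp = std::nearbyint(a);      // nearest integer, current rounding mode
  return tmp > a ? tmp - 1.0f : tmp;  // rounded up? step back down
}
inline float scalar_ceil_sketch(float a) {
  float tmp = std::nearbyint(a);
  return tmp < a ? tmp + 1.0f : tmp;  // rounded down? step back up
}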
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool*     from) { EIGEN_DEBUG_ALIGNED_LOAD return  _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
 
 #if EIGEN_COMP_MSVC
-  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_ps(from);
 }
 #else
 // NOTE: with the code below, MSVC's compiler crashes!
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_ps(from);
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_pd(from);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
-template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool*     from) {
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
 
 // Load the lower part of a packet, zero-extending the remainder.
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
-template<> EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))); }
-template<> EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
 
 // Load scalar
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
-template<> EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
   return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
 }
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
-{ return pset1<Packet2d>(from[0]); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return pset1<Packet2d>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
   Packet4i tmp;
   tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }
-template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
   Packet4ui tmp;
   tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
@@ -935,154 +1323,268 @@
 
 // Loads 8 bools from memory and returns the packet
 // {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
-template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
   __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
-  return  _mm_unpacklo_epi8(tmp, tmp);
+  return _mm_unpacklo_epi8(tmp, tmp);
 }
 
 // Loads 4 bools from memory and returns the packet
 // {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
-template<> EIGEN_STRONG_INLINE Packet16b
-ploadquad<Packet16b>(const bool* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
   __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
   tmp = _mm_unpacklo_epi8(tmp, tmp);
-  return  _mm_unpacklo_epi16(tmp, tmp);
+  return _mm_unpacklo_epi16(tmp, tmp);
 }
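// A hedged scalar sketch of the duplication pattern the two bool loads above
// build with _mm_unpacklo_epi8/_mm_unpacklo_epi16; `times` is 2 for ploaddup
// and 4 for ploadquad. The helper is invented for this illustration.
inline void bool_dup_sketch(const bool* from, bool* to16, int times) {
  for (int i = 0; i < 16; ++i) to16[i] = from[i / times];  // each input repeated `times` times
}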
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
-template<> EIGEN_STRONG_INLINE void pstorel(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstorel(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from); }
-
-template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
-template<> EIGEN_STRONG_INLINE void pstores(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from); }
-template<> EIGEN_STRONG_INLINE void pstores(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
- return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
- return _mm_set_pd(from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
- return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  return _mm_set_pd(from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
+  return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
   return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
                        numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
-{
-  return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
-                      from[11*stride], from[10*stride], from[9*stride], from[8*stride],
-                      from[7*stride], from[6*stride], from[5*stride], from[4*stride],
-                      from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
+  return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
+                      from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
+                      from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
+                      from[0 * stride]);
 }
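// A hedged sketch of the gather contract shared by the pgather overloads
// above: element i of the packet is read from from[i * stride]. The _mm_set_*
// intrinsics take arguments high-to-low, hence the reversed order above.
#include <cstddef>
template <typename Scalar, int N>
inline void gather_sketch(const Scalar* from, std::ptrdiff_t stride, Scalar (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i * stride];  // strided read
}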
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
-  to[stride*0] = _mm_cvtss_f32(from);
-  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
-  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
-  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  to[stride * 0] = _mm_cvtss_f32(from);
+  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  to[stride*0] = _mm_cvtsd_f64(from);
-  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  to[stride * 0] = _mm_cvtsd_f64(from);
+  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
-  to[stride*0] = _mm_cvtsi128_si32(from);
-  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
-  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
-  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
+  to[stride * 0] = _mm_cvtsi128_si32(from);
+  to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
   to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
   to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
   to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
   to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
-{
-  to[4*stride*0] = _mm_cvtsi128_si32(from);
-  to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
-  to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
-  to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
+  to[4 * stride * 0] = _mm_cvtsi128_si32(from);
+  to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[4 * stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
 
-
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
   Packet4f pa = _mm_set_ss(a);
-  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+  pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
   Packet2d pa = _mm_set_sd(a);
-  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+  pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
 }
 
 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
-typedef const void * SsePrefetchPtrType;
+typedef const void* SsePrefetchPtrType;
 #else
-typedef const char * SsePrefetchPtrType;
+typedef const char* SsePrefetchPtrType;
 #endif
 
 #ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 #endif
 
 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
 // Direct access to the struct members fixed bug #62.
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-template<> EIGEN_STRONG_INLINE uint32_t    pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return a.m128_f32[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return a.m128d_f64[0];
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+  return x;
+}
 #elif EIGEN_COMP_MSVC_STRICT
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  float x = _mm_cvtss_f32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  double x = _mm_cvtsd_f64(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+  return x;
+}
 #else
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return _mm_cvtss_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return _mm_cvtsd_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  return _mm_cvtsi128_si32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+}
 #endif
-template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
+template <>
+EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return static_cast<bool>(x & 1);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { return _mm_shuffle_epi32(a, 0x1B); }
-template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return _mm_shuffle_ps(a, a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return _mm_shuffle_pd(a, a, 0x1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+  return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
 #ifdef EIGEN_VECTORIZE_SSSE3
   __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   return _mm_shuffle_epi8(a, mask);
@@ -1093,30 +1595,33 @@
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 // Extract the exponent without relying on the existence of Packet2l.
-template<>
-EIGEN_STRONG_INLINE  
-Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
-  const Packet2d cst_exp_mask  = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
   __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
   return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
 }
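// A hedged scalar sketch of the extraction above: mask the eleven exponent
// bits of an IEEE-754 double and shift them down by 52; the packet version
// then converts the two results to doubles. The helper name is invented.
#include <cstdint>
#include <cstring>
inline double biased_exponent_sketch(double a) {
  uint64_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  return static_cast<double>((bits & 0x7ff0000000000000ull) >> 52);  // biased exponent field
}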
 
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
 }
 
 // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
 // supported by SSE, and has more range than is needed for exponents.
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
   // Clamp exponent to [-2099, 2099]
   const Packet2d max_exponent = pset1<Packet2d>(2099.0);
   const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
@@ -1126,226 +1631,223 @@
 
   // Split 2^e into four factors and multiply:
   const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
-  Packet4i b = parithmetic_shift_right<2>(ei);  // floor(e/4)
+  Packet4i b = parithmetic_shift_right<2>(ei);                       // floor(e/4)
   Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
-  Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
-  b = psub(psub(psub(ei, b), b), b);  // e - 3b
-  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^(e - 3b)
-  out = pmul(out, c);  // a * 2^e
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                       // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);                                 // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));           // 2^(e - 3b)
+  out = pmul(out, c);                                                // a * 2^e
   return out;
 }
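// A hedged scalar sketch of the splitting trick above: a single 2^e can
// overflow the double exponent range, so with b = floor(e/4) the product is
// built as a * 2^b * 2^b * 2^b * 2^(e-3b), each factor staying representable.
// std::ldexp stands in for the bit-level construction of 2^b.
#include <cmath>
inline double ldexp_split_sketch(double a, int e) {
  int b = e >> 2;                           // floor(e/4), assuming an arithmetic shift
  double c = std::ldexp(1.0, b);            // 2^b
  double out = ((a * c) * c) * c;           // a * 2^(3b)
  return out * std::ldexp(1.0, e - 3 * b);  // * 2^(e-3b) == a * 2^e
}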
 
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
   a3 = pload<Packet4f>(a);
-  a0 = vec4f_swizzle1(a3, 0,0,0,0);
-  a1 = vec4f_swizzle1(a3, 1,1,1,1);
-  a2 = vec4f_swizzle1(a3, 2,2,2,2);
-  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+  a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
+  a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
+  a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
+  a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
 }
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
 #ifdef EIGEN_VECTORIZE_SSE3
-  a0 = _mm_loaddup_pd(a+0);
-  a1 = _mm_loaddup_pd(a+1);
-  a2 = _mm_loaddup_pd(a+2);
-  a3 = _mm_loaddup_pd(a+3);
+  a0 = _mm_loaddup_pd(a + 0);
+  a1 = _mm_loaddup_pd(a + 1);
+  a2 = _mm_loaddup_pd(a + 2);
+  a3 = _mm_loaddup_pd(a + 3);
 #else
   a1 = pload<Packet2d>(a);
-  a0 = vec2d_swizzle1(a1, 0,0);
-  a1 = vec2d_swizzle1(a1, 1,1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec2d_swizzle1(a3, 0,0);
-  a3 = vec2d_swizzle1(a3, 1,1);
+  a0 = vec2d_swizzle1(a1, 0, 0);
+  a1 = vec2d_swizzle1(a1, 1, 1);
+  a3 = pload<Packet2d>(a + 2);
+  a2 = vec2d_swizzle1(a3, 0, 0);
+  a3 = vec2d_swizzle1(a3, 1, 1);
 #endif
 }
 #endif
 
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
-{
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
   vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
   vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
   // Disable SSE3 _mm_hadd_ps, which is extremely slow on all existing Intel architectures
   // (from Nehalem to Haswell)
   // #ifdef EIGEN_VECTORIZE_SSE3
   //   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
   //   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
   // #else
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
   // #endif
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
   // Disable SSE3 _mm_hadd_pd, which is extremely slow on all existing Intel architectures
   // (from Nehalem to Haswell)
   // #ifdef EIGEN_VECTORIZE_SSE3
   //   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
   // #else
-  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
   // #endif
 }
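// A hedged scalar view of the reduction above: _mm_movehl_ps folds the upper
// pair onto the lower pair, and one more shuffle folds the remaining two
// lanes. The helper name is invented for this illustration.
inline float predux4_sketch(const float (&a)[4]) {
  float t0 = a[0] + a[2];  // lane 0 after the movehl fold
  float t1 = a[1] + a[3];  // lane 1 after the movehl fold
  return t0 + t1;          // final _mm_add_ss with the shuffled lane
}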
 
 #ifdef EIGEN_VECTORIZE_SSSE3
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i tmp0 = _mm_hadd_epi32(a,a);
-  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+  Packet4i tmp0 = _mm_hadd_epi32(a, a);
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
   Packet4ui tmp0 = _mm_hadd_epi32(a, a);
   return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
 }
 
 #else
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
   Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
   return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
-  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+template <>
+EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
   return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
 }
 
 // Other reduction functions:
 
-
 // mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., reusing pmul is very slow!)
   // TODO try to call _mm_mul_epu32 directly
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., reusing pmul is very slow!)
   // TODO try to call _mm_mul_epu32 directly
   EIGEN_ALIGN16 uint32_t aux[4];
   pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
-  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
-  return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
-          (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+template <>
+EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
 }
 
 // min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
 }
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
 #else
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore!)
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
-  return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+  int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
+  return aux0 < aux2 ? aux0 : aux2;
+#endif  // EIGEN_VECTORIZE_SSE4_1
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst<Packet4ui>(_mm_min_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
 #else
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore!)
   EIGEN_ALIGN16 uint32_t aux[4];
   pstore(aux, a);
-  uint32_t aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  uint32_t aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
-  return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+  uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
+  uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
+  return aux0 < aux2 ? aux0 : aux2;
+#endif  // EIGEN_VECTORIZE_SSE4_1
 }
 
 // max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
 }
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
 }
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
 #else
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., it does not like using std::max after the pstore!)
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
-  return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+  int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
+  return aux0 > aux2 ? aux0 : aux2;
+#endif  // EIGEN_VECTORIZE_SSE4_1
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
 #ifdef EIGEN_VECTORIZE_SSE4_1
-  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst<Packet4ui>(_mm_max_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+  return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
 #else
   // After some experiments, this seems to be the fastest way to implement it
   // for GCC (e.g., it does not like using std::max after the pstore!)
   EIGEN_ALIGN16 uint32_t aux[4];
   pstore(aux, a);
-  uint32_t aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  uint32_t aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
-  return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+  uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
+  uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
+  return aux0 > aux2 ? aux0 : aux2;
+#endif  // EIGEN_VECTORIZE_SSE4_1
 }
 
 // not needed yet
@@ -1354,34 +1856,31 @@
 //   return _mm_movemask_ps(x) == 0xF;
 // }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
   return _mm_movemask_ps(x) != 0x0;
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
   return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
 }
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
   return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
   kernel.packet[1] = tmp;
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
   __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
   __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
   __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
@@ -1396,20 +1895,18 @@
   ptranspose((PacketBlock<Packet4i, 4>&)kernel);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16b,4>& kernel) {
-  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
-  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
-  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
-  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
   kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
   kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
   kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
   kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16b,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
   // If we number the elements in the input thus:
   // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
   // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
@@ -1421,67 +1918,72 @@
   // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
   // ...
   // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
-  __m128i t0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  __m128i t1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
-  __m128i t2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ...                     27 37
-  __m128i t3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ...                     2f 3f
-  __m128i t4 =  _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52                         47 57
-  __m128i t5 =  _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
-  __m128i t6 =  _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
-  __m128i t7 =  _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
-  __m128i t8 =  _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
-  __m128i t9 =  _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
-  __m128i ta =  _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
-  __m128i tb =  _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
-  __m128i tc =  _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
-  __m128i td =  _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
-  __m128i te =  _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
-  __m128i tf =  _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i t0 =
+      _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =
+      _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);  // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =
+      _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);  // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =
+      _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);  // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =
+      _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);  // 40 50 41 51 42 52                         47 57
+  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);  // 48 58 49 59 4a 5a
+  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
 
-  __m128i s0 =  _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  __m128i s1 =  _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
-  __m128i s2 =  _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
-  __m128i s3 =  _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
-  __m128i s4 =  _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-  __m128i s5 =  _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
-  __m128i s6 =  _mm_unpacklo_epi16(t5, t7);
-  __m128i s7 =  _mm_unpackhi_epi16(t5, t7);
-  __m128i s8 =  _mm_unpacklo_epi16(t8, ta);
-  __m128i s9 =  _mm_unpackhi_epi16(t8, ta);
-  __m128i sa =  _mm_unpacklo_epi16(t9, tb);
-  __m128i sb =  _mm_unpackhi_epi16(t9, tb);
-  __m128i sc =  _mm_unpacklo_epi16(tc, te);
-  __m128i sd =  _mm_unpackhi_epi16(tc, te);
-  __m128i se =  _mm_unpacklo_epi16(td, tf);
-  __m128i sf =  _mm_unpackhi_epi16(td, tf);
+  __m128i s0 = _mm_unpacklo_epi16(t0, t2);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 = _mm_unpackhi_epi16(t0, t2);  // 04 14 24 34
+  __m128i s2 = _mm_unpacklo_epi16(t1, t3);  // 08 18 28 38 ...
+  __m128i s3 = _mm_unpackhi_epi16(t1, t3);  // 0c 1c 2c 3c ...
+  __m128i s4 = _mm_unpacklo_epi16(t4, t6);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 = _mm_unpackhi_epi16(t4, t6);  // 44 54 64 74 ...
+  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
+  __m128i sa = _mm_unpacklo_epi16(t9, tb);
+  __m128i sb = _mm_unpackhi_epi16(t9, tb);
+  __m128i sc = _mm_unpacklo_epi16(tc, te);
+  __m128i sd = _mm_unpackhi_epi16(tc, te);
+  __m128i se = _mm_unpacklo_epi16(td, tf);
+  __m128i sf = _mm_unpackhi_epi16(td, tf);
 
-  __m128i u0 =  _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  __m128i u1 =  _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-  __m128i u2 =  _mm_unpacklo_epi32(s1, s5);
-  __m128i u3 =  _mm_unpackhi_epi32(s1, s5);
-  __m128i u4 =  _mm_unpacklo_epi32(s2, s6);
-  __m128i u5 =  _mm_unpackhi_epi32(s2, s6);
-  __m128i u6 =  _mm_unpacklo_epi32(s3, s7);
-  __m128i u7 =  _mm_unpackhi_epi32(s3, s7);
-  __m128i u8 =  _mm_unpacklo_epi32(s8, sc);
-  __m128i u9 =  _mm_unpackhi_epi32(s8, sc);
-  __m128i ua =  _mm_unpacklo_epi32(s9, sd);
-  __m128i ub =  _mm_unpackhi_epi32(s9, sd);
-  __m128i uc =  _mm_unpacklo_epi32(sa, se);
-  __m128i ud =  _mm_unpackhi_epi32(sa, se);
-  __m128i ue =  _mm_unpacklo_epi32(sb, sf);
-  __m128i uf =  _mm_unpackhi_epi32(sb, sf);
+  __m128i u0 = _mm_unpacklo_epi32(s0, s4);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 = _mm_unpackhi_epi32(s0, s4);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
+  __m128i ua = _mm_unpacklo_epi32(s9, sd);
+  __m128i ub = _mm_unpackhi_epi32(s9, sd);
+  __m128i uc = _mm_unpacklo_epi32(sa, se);
+  __m128i ud = _mm_unpackhi_epi32(sa, se);
+  __m128i ue = _mm_unpacklo_epi32(sb, sf);
+  __m128i uf = _mm_unpackhi_epi32(sb, sf);
 
-  kernel.packet[0]  = _mm_unpacklo_epi64(u0, u8);
-  kernel.packet[1]  = _mm_unpackhi_epi64(u0, u8);
-  kernel.packet[2]  = _mm_unpacklo_epi64(u1, u9);
-  kernel.packet[3]  = _mm_unpackhi_epi64(u1, u9);
-  kernel.packet[4]  = _mm_unpacklo_epi64(u2, ua);
-  kernel.packet[5]  = _mm_unpackhi_epi64(u2, ua);
-  kernel.packet[6]  = _mm_unpacklo_epi64(u3, ub);
-  kernel.packet[7]  = _mm_unpackhi_epi64(u3, ub);
-  kernel.packet[8]  = _mm_unpacklo_epi64(u4, uc);
-  kernel.packet[9]  = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
   kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
   kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
   kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
@@ -1490,7 +1992,9 @@
   kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
 }
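// A hedged scalar statement of what the four unpack stages above compute:
// out[j][i] = in[i][j], i.e. a full 16x16 byte transpose performed at byte,
// 16-bit, 32-bit and finally 64-bit interleave granularity.
inline void transpose16_sketch(const unsigned char (&in)[16][16], unsigned char (&out)[16][16]) {
  for (int i = 0; i < 16; ++i)
    for (int j = 0; j < 16; ++j) out[j][i] = in[i][j];
}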
 
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
   __m128i false_mask = _mm_cmpeq_epi32(select, zero);
@@ -1500,11 +2004,14 @@
   return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
 #endif
 }
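// A hedged scalar sketch of the pblend contract implemented above: a nonzero
// selector entry picks the "then" element, a zero entry picks the "else"
// element. The helper is invented for this illustration.
template <typename Scalar, int N>
inline void blend_sketch(const int (&select)[N], const Scalar (&thenP)[N], const Scalar (&elseP)[N],
                         Scalar (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = (select[i] != 0) ? thenP[i] : elseP[i];
}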
-template<> EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
-                                    const Packet4ui& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
+                                     const Packet4ui& elsePacket) {
   return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
 }
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
   const __m128 zero = _mm_setzero_ps();
   const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
   __m128 false_mask = _mm_cmpeq_ps(select, zero);
@@ -1514,7 +2021,9 @@
   return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
   const __m128d zero = _mm_setzero_pd();
   const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
   __m128d false_mask = _mm_cmpeq_pd(select, zero);
@@ -1527,29 +2036,37 @@
 
 // Scalar path for pmadd with FMA to ensure consistency with the vectorized path.
 #ifdef EIGEN_VECTORIZE_FMA
-template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(a,b,c);
+template <>
+EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(a, b, c);
 }
-template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
-  return ::fma(a,b,c);
+template <>
+EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return ::fma(a, b, c);
 }
-template<> EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(a,b,-c);
+template <>
+EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
+  return ::fmaf(a, b, -c);
 }
-template<> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
-  return ::fma(a,b,-c);
+template <>
+EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
+  return ::fma(a, b, -c);
 }
-template<> EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a,b,c);
+template <>
+EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
+  return ::fmaf(-a, b, c);
 }
-template<> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
-  return ::fma(-a,b,c);
+template <>
+EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
+  return ::fma(-a, b, c);
 }
-template<> EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
-  return ::fmaf(-a,b,-c);
+template <>
+EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
+  return ::fmaf(-a, b, -c);
 }
-template<> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
-  return ::fma(-a,b,-c);
+template <>
+EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
+  return ::fma(-a, b, -c);
 }
 #endif
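The four specializations differ only in where the signs go; spelled out against
std::fma (a reference sketch, not part of the patch):

#include <cmath>

double pmadd_ref (double a, double b, double c) { return std::fma( a, b,  c); }  //  a*b + c
double pmsub_ref (double a, double b, double c) { return std::fma( a, b, -c); }  //  a*b - c
double pnmadd_ref(double a, double b, double c) { return std::fma(-a, b,  c); }  // -a*b + c
double pnmsub_ref(double a, double b, double c) { return std::fma(-a, b, -c); }  // -a*b - c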
 
@@ -1571,8 +2088,7 @@
   // Inf/NaN?
   __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
   // Inf/NaN adjust
-  __m128i naninf_adj =
-      _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
+  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
   // extra exp adjust for Inf/NaN
   ou = _mm_add_epi32(ou, naninf_adj);
 
@@ -1584,11 +2100,9 @@
   // magic.u = 113 << 23
   __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
   // o.f -= magic.f
-  ou = _mm_castps_si128(
-      _mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
+  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
 
-  __m128i sign =
-      _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
+  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
   // o.u |= (h.x & 0x8000) << 16;    // sign bit
   ou = _mm_or_si128(ou, sign);
   // return o.f;
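The comments above track the classic scalar half->float bit trick. A scalar
reconstruction of the algorithm these intrinsics vectorize (a sketch under
that assumption, not this file's code):

#include <cstdint>
#include <cstring>

float half_to_float_ref(uint16_t h) {
  const uint32_t shifted_exp = 0x7c00u << 13;  // exponent mask, post-shift
  uint32_t u = (h & 0x7fffu) << 13;            // exponent + mantissa bits
  uint32_t exp = u & shifted_exp;
  u += (127 - 15) << 23;                       // rebias exponent: half -> float
  if (exp == shifted_exp) {
    u += (128 - 16) << 23;                     // extra adjust maps Inf/NaN to Inf/NaN
  } else if (exp == 0) {                       // zero or subnormal input
    u += 1 << 23;
    float f, magic;
    const uint32_t magic_u = 113u << 23;
    std::memcpy(&f, &u, 4);
    std::memcpy(&magic, &magic_u, 4);
    f -= magic;                                // renormalize via float subtraction
    std::memcpy(&u, &f, 4);
  }
  u |= uint32_t(h & 0x8000u) << 16;            // reattach the sign bit
  float out;
  std::memcpy(&out, &u, 4);
  return out;
}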
@@ -1622,8 +2136,7 @@
   __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
 
   __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
-  __m128i subnorm_mask =
-      _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
+  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
   //  f.f += denorm_magic.f;
   f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
   // f.u - denorm_magic.u
@@ -1656,7 +2169,7 @@
 
 // Packet math for Eigen::half
 // Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
 
 typedef struct {
@@ -1859,19 +2372,18 @@
 
 #endif
 
+}  // end namespace internal
 
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
 // PGI++ does not define the following intrinsics in C++ mode.
-static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
-static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
-static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
-static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
+static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
 #endif
 
-#endif // EIGEN_PACKET_MATH_SSE_H
+#endif  // EIGEN_PACKET_MATH_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 7e3099b..cbc6d47 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -18,23 +18,29 @@
 namespace internal {
 
 #ifndef EIGEN_VECTORIZE_AVX
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
 
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
 
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
 
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
 #endif
 
 template <>
-EIGEN_STRONG_INLINE Packet16b pcast<Packet4f, Packet16b>(const Packet4f& a,
-                                                         const Packet4f& b,
-                                                         const Packet4f& c,
+EIGEN_STRONG_INLINE Packet16b pcast<Packet4f, Packet16b>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
                                                          const Packet4f& d) {
   __m128 zero = pzero(a);
   __m128 nonzero_a = _mm_cmpneq_ps(a, zero);
@@ -50,79 +56,92 @@
 template <>
 EIGEN_STRONG_INLINE Packet4f pcast<Packet16b, Packet4f>(const Packet16b& a) {
   const __m128 cst_one = _mm_set_ps1(1.0f);
-  #ifdef EIGEN_VECTORIZE_SSE4_1
+#ifdef EIGEN_VECTORIZE_SSE4_1
   __m128i a_extended = _mm_cvtepi8_epi32(a);
   __m128i abcd = _mm_cmpeq_epi32(a_extended, _mm_setzero_si128());
-  #else
+#else
   __m128i abcd_efgh_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
   __m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efgh_ijkl_mnop, abcd_efgh_ijkl_mnop);
   __m128i abcd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
-  #endif
+#endif
   __m128 result = _mm_andnot_ps(_mm_castsi128_ps(abcd), cst_one);
   return result;
 }
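Without SSE4.1's sign extension, the two unpacklo_epi8 steps replicate each of
the first four mask bytes across a full 32-bit lane; since a comparison mask
byte is 0x00 or 0xff, the replicated bytes form a valid 32-bit lane mask. The
step in isolation (illustrative sketch):

#include <emmintrin.h>

// bytes {m0, m1, ..., m15} -> {m0 x4, m1 x4, m2 x4, m3 x4}, one 32-bit lane each
static inline __m128i widen_low4_mask_bytes(__m128i byte_mask) {
  __m128i pairs = _mm_unpacklo_epi8(byte_mask, byte_mask);  // {m0, m0, m1, m1, ...}
  return _mm_unpacklo_epi8(pairs, pairs);                   // {m0, m0, m0, m0, m1, ...}
}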
 
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
   return _mm_cvttps_epi32(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
-  return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)),
-                                         _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)), _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
                                          (1 << 2) | (1 << 6)));
 }
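The shuffle immediate (1 << 2) | (1 << 6) is 0b01000100: two selector bits per
destination lane, yielding { a[0], a[1], b[0], b[1] }, i.e. the two 2-lane
cvttpd results concatenated. Equivalently, with the conventional macro (an
annotation, not part of the patch):

#include <xmmintrin.h>

static_assert(((1 << 2) | (1 << 6)) == _MM_SHUFFLE(1, 0, 1, 0),
              "dst = { a[0], a[1], b[0], b[1] }");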
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
   return _mm_cvtepi32_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
   return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
   // Simply discard the second half of the input
   return _mm_cvtepi32_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
   // Simply discard the second half of the input
   return _mm_cvtps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
   return _mm_castps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
   return _mm_castpd_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
   return _mm_castps_si128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
   return _mm_castsi128_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
   return _mm_castsi128_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
   return _mm_castpd_si128(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
   return Packet4ui(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
   return Packet4i(a);
 }
 // Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
 
 template <>
@@ -171,8 +190,8 @@
 
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_SSE_H
+#endif  // EIGEN_TYPE_CASTING_SSE_H
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 64b710f..6a03de9 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -13,10 +13,8 @@
 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"
 
-namespace Eigen
-{
-namespace internal
-{
+namespace Eigen {
+namespace internal {
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
@@ -75,174 +73,146 @@
 };
 
 template <>
-EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
-{
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
   svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
-{
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
   return svdup_n_s32(from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
-{
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
   numext::int32_t c[packet_traits<numext::int32_t>::size];
   for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
   return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svadd_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svsub_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
   return svneg_s32_z(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svmul_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svdiv_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
-{
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
   return svmla_s32_z(svptrue_b32(), c, a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svmin_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svmax_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
   return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
   return svdup_n_s32_z(svptrue_b32(), 0);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svand_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svorr_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return sveor_s32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
   return svbic_s32_z(svptrue_b32(), a, b);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
   return svasrd_n_s32_z(svptrue_b32(), a, N);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
   return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), N));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
   return svlsl_n_s32_z(svptrue_b32(), a, N);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
   svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
   return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
 }
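svindex_u32(0, 1) yields {0, 1, 2, ...} and one self-zip turns that into
{0, 0, 1, 1, ...}, so the gather duplicates each source element; ploadquad
below zips twice for a four-fold repeat. Reference semantics (scalar sketch):

#include <cstdint>

// out[i] = from[i / 2] for ploaddup; ploadquad uses from[i / 4].
void ploaddup_ref(const int32_t* from, int32_t* out, int packet_size) {
  for (int i = 0; i < packet_size; ++i) out[i] = from[i / 2];
}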
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
   svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
@@ -250,63 +220,54 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
-{
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
   EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
-{
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
   EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
   // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
   svint32_t indices = svindex_s32(0, stride);
   return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
+                                                                  Index stride) {
   // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
   svint32_t indices = svindex_s32(0, stride);
   svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
 }
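The strided gather/scatter pair reduces to simple indexed loops. Reference
semantics (scalar sketch):

#include <cstdint>

void pgather_ref(const int32_t* from, long stride, int32_t* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = from[i * stride];  // read every stride-th element
}
void pscatter_ref(int32_t* to, long stride, const int32_t* in, int n) {
  for (int i = 0; i < n; ++i) to[i * stride] = in[i];  // write every stride-th slot
}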
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
   // svlasta returns the first element if all predicate bits are 0
   return svlasta_s32(svpfalse_b(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
   return svrev_s32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
   return svabs_s32_z(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a) {
   return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
-{
-  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
-                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
 
   // Multiply the vector by its reverse
   svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
@@ -338,14 +299,12 @@
 }
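The reverse-multiply step above halves the problem each round: after
prod = a * reverse(a), the product of the first half of prod equals the
product of all lanes of a, so the reduction recurses on half the lanes. A
scalar model (a sketch, assuming a power-of-two lane count):

#include <cstdint>
#include <vector>

int32_t predux_mul_ref(const int32_t* a, int n) {
  std::vector<int32_t> v(a, a + n);
  while (n > 1) {
    for (int i = 0; i < n / 2; ++i) v[i] *= v[n - 1 - i];  // x * reverse(x); keep low half
    n /= 2;
  }
  return v[0];
}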
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
   return svminv_s32(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
   return svmaxv_s32(svptrue_b32(), a);
 }
 
@@ -422,120 +381,101 @@
 };
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
-{
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
   return svdup_n_f32(from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
-{
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
-{
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
   float c[packet_traits<float>::size];
   for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
   return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svadd_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svsub_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
   return svneg_f32_z(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svmul_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svdiv_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
-{
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
   return svmla_f32_z(svptrue_b32(), c, a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svmin_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
   return pmin<PacketXf>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svminnm_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svmax_f32_z(svptrue_b32(), a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
   return pmax<PacketXf>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svmaxnm_f32_z(svptrue_b32(), a, b);
 }
 
 // Float comparisons in SVE return an svbool (predicate). Use svdup to set
 // active lanes to all-ones (0xffffffffu) and inactive lanes to 0.
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
 }
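Per the comment above, each comparison yields a per-lane predicate, and
svdup_n_u32_z materializes it as an integer mask: active lanes become all-ones
bit patterns, inactive lanes zero, which Eigen then consumes as a bit mask.
Reference semantics (scalar sketch):

#include <cstdint>

void pcmp_le_ref(const float* a, const float* b, uint32_t* mask, int n) {
  for (int i = 0; i < n; ++i) mask[i] = (a[i] <= b[i]) ? 0xffffffffu : 0u;
}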
 
@@ -543,71 +483,60 @@
 // greater/equal comparison (svcmpge_f32). Then fill a float vector with the
 // active elements.
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
   return svrintm_f32_z(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
   return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
 }
 
 // Logical Operations are not supported for float, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
   return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
   svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
   return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
   svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
   indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
@@ -615,63 +544,54 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
-{
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
   EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
-{
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
   EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
   // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
   svint32_t indices = svindex_s32(0, stride);
   return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
   // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
   svint32_t indices = svindex_s32(0, stride);
   svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
 }
 
 template <>
-EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
   // svlasta returns the first element if all predicate bits are 0
   return svlasta_f32(svpfalse_b(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
   return svrev_f32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
   return svabs_f32_z(svptrue_b32(), a);
 }
 
-// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for 
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
 // all vector extensions and the generic version.
 template <>
-EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
-{
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
   return svaddv_f32(svptrue_b32(), a);
 }
 
@@ -679,10 +599,8 @@
 // mul
 // Only works for SVE vector lengths that are multiples of 128 bits
 template <>
-EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
-{
-  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
-                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
   // Multiply the vector by its reverse
   svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
   svfloat32_t half_prod;
@@ -713,20 +631,17 @@
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
   return svminv_f32(svptrue_b32(), a);
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
   return svmaxv_f32(svptrue_b32(), a);
 }
 
-template<int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
-{
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
   float buffer[packet_traits<float>::size * N] = {0};
   int i = 0;
 
@@ -741,9 +656,8 @@
   }
 }
 
-template<>
-EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
-{
+template <>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
   return pldexp_generic(a, exponent);
 }
 
diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
index 068ff48..b451676 100644
--- a/Eigen/src/Core/arch/SVE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -49,4 +49,4 @@
 }  // namespace internal
 }  // namespace Eigen
 
-#endif // EIGEN_TYPE_CASTING_SVE_H
+#endif  // EIGEN_TYPE_CASTING_SVE_H
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index 27d9a82..578e0f3 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -78,12 +78,11 @@
 };
 
 #ifdef SYCL_DEVICE_ONLY
-#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
-  template <>                                                              \
-  struct packet_traits<unpacket_type>                                      \
-      : sycl_packet_traits<has_blend, lengths> {                           \
-    typedef packet_type type;                                              \
-    typedef packet_type half;                                              \
+#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths)       \
+  template <>                                                                    \
+  struct packet_traits<unpacket_type> : sycl_packet_traits<has_blend, lengths> { \
+    typedef packet_type type;                                                    \
+    typedef packet_type half;                                                    \
   };
 
 SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
@@ -134,15 +133,13 @@
 #ifndef SYCL_DEVICE_ONLY
 template <typename PacketReturnType, int PacketSize>
 struct PacketWrapper {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
   EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
     eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE");
     abort();
   }
-  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in,
-                                                                   Scalar) {
+  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
     return ::Eigen::internal::template plset<PacketReturnType>(in);
   }
   EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
@@ -154,8 +151,7 @@
 #elif defined(SYCL_DEVICE_ONLY)
 template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 4> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
     switch (index) {
@@ -168,15 +164,14 @@
       case 3:
         return in.w();
       default:
-      //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
-      // The code will never reach here
-      __builtin_unreachable();
+        // INDEX MUST BE BETWEEN 0 and 3. There is no abort function in a SYCL kernel, so we cannot use abort here.
+        // The code will never reach here.
+        __builtin_unreachable();
     }
     __builtin_unreachable();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
-      Scalar in, Scalar other) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
     return PacketReturnType(in, other, other, other);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
@@ -186,25 +181,20 @@
 
 template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 1> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
     return in;
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in,
-                                                                   Scalar) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
     return PacketReturnType(in);
   }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
-    lhs = rhs[0];
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { lhs = rhs[0]; }
 };
 
 template <typename PacketReturnType>
 struct PacketWrapper<PacketReturnType, 2> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
     switch (index) {
@@ -213,15 +203,14 @@
       case 1:
         return in.y();
       default:
-        //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
-      // The code will never reach here
+        // INDEX MUST BE BETWEEN 0 and 1. There is no abort function in a SYCL kernel, so we cannot use abort here.
+        // The code will never reach here.
         __builtin_unreachable();
     }
     __builtin_unreachable();
   }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
-      Scalar in, Scalar other) {
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
     return PacketReturnType(in, other);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index a8adc46..b20c32b 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -31,11 +31,10 @@
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(SYCL_DEVICE_ONLY)
-#define SYCL_PLOG(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::log(a);                                           \
+#define SYCL_PLOG(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>(const packet_type& a) { \
+    return cl::sycl::log(a);                                                                  \
   }
 
 SYCL_PLOG(cl::sycl::cl_half8)
@@ -43,11 +42,10 @@
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
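Expanded for one of the instantiations above (cl::sycl::cl_half8), the macro
produces an ordinary specialization that defers to the SYCL built-in (shown
for illustration):

template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 plog<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
  return cl::sycl::log(a);
}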
 
-#define SYCL_PLOG1P(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::log1p(a);                                           \
+#define SYCL_PLOG1P(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>(const packet_type& a) { \
+    return cl::sycl::log1p(a);                                                                  \
   }
 
 SYCL_PLOG1P(cl::sycl::cl_half8)
@@ -55,11 +53,10 @@
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
 
-#define SYCL_PLOG10(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::log10(a);                                           \
+#define SYCL_PLOG10(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>(const packet_type& a) { \
+    return cl::sycl::log10(a);                                                                  \
   }
 
 SYCL_PLOG10(cl::sycl::cl_half8)
@@ -67,11 +64,10 @@
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
 
-#define SYCL_PEXP(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::exp(a);                                           \
+#define SYCL_PEXP(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>(const packet_type& a) { \
+    return cl::sycl::exp(a);                                                                  \
   }
 
 SYCL_PEXP(cl::sycl::cl_half8)
@@ -81,11 +77,10 @@
 SYCL_PEXP(cl::sycl::cl_double2)
 #undef SYCL_PEXP
 
-#define SYCL_PEXPM1(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::expm1(a);                                           \
+#define SYCL_PEXPM1(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>(const packet_type& a) { \
+    return cl::sycl::expm1(a);                                                                  \
   }
 
 SYCL_PEXPM1(cl::sycl::cl_half8)
@@ -93,11 +88,10 @@
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
 
-#define SYCL_PSQRT(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::sqrt(a);                                           \
+#define SYCL_PSQRT(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>(const packet_type& a) { \
+    return cl::sycl::sqrt(a);                                                                  \
   }
 
 SYCL_PSQRT(cl::sycl::cl_half8)
@@ -105,11 +99,10 @@
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
 
-#define SYCL_PRSQRT(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::rsqrt(a);                                           \
+#define SYCL_PRSQRT(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>(const packet_type& a) { \
+    return cl::sycl::rsqrt(a);                                                                  \
   }
 
 SYCL_PRSQRT(cl::sycl::cl_half8)
@@ -118,11 +111,10 @@
 #undef SYCL_PRSQRT
 
 /** \internal \returns the sine of \a a (coeff-wise) */
-#define SYCL_PSIN(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::sin(a);                                           \
+#define SYCL_PSIN(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>(const packet_type& a) { \
+    return cl::sycl::sin(a);                                                                  \
   }
 
 SYCL_PSIN(cl::sycl::cl_half8)
@@ -131,11 +123,10 @@
 #undef SYCL_PSIN
 
 /** \internal \returns the cosine of \a a (coeff-wise) */
-#define SYCL_PCOS(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::cos(a);                                           \
+#define SYCL_PCOS(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>(const packet_type& a) { \
+    return cl::sycl::cos(a);                                                                  \
   }
 
 SYCL_PCOS(cl::sycl::cl_half8)
@@ -144,11 +135,10 @@
 #undef SYCL_PCOS
 
 /** \internal \returns the tangent of \a a (coeff-wise) */
-#define SYCL_PTAN(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::tan(a);                                           \
+#define SYCL_PTAN(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>(const packet_type& a) { \
+    return cl::sycl::tan(a);                                                                  \
   }
 
 SYCL_PTAN(cl::sycl::cl_half8)
@@ -157,11 +147,10 @@
 #undef SYCL_PTAN
 
 /** \internal \returns the arc sine of \a a (coeff-wise) */
-#define SYCL_PASIN(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::asin(a);                                           \
+#define SYCL_PASIN(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>(const packet_type& a) { \
+    return cl::sycl::asin(a);                                                                  \
   }
 
 SYCL_PASIN(cl::sycl::cl_half8)
@@ -170,11 +159,10 @@
 #undef SYCL_PASIN
 
 /** \internal \returns the arc cosine of \a a (coeff-wise) */
-#define SYCL_PACOS(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::acos(a);                                           \
+#define SYCL_PACOS(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>(const packet_type& a) { \
+    return cl::sycl::acos(a);                                                                  \
   }
 
 SYCL_PACOS(cl::sycl::cl_half8)
@@ -183,11 +171,10 @@
 #undef SYCL_PACOS
 
 /** \internal \returns the arc tangent of \a a (coeff-wise) */
-#define SYCL_PATAN(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::atan(a);                                           \
+#define SYCL_PATAN(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>(const packet_type& a) { \
+    return cl::sycl::atan(a);                                                                  \
   }
 
 SYCL_PATAN(cl::sycl::cl_half8)
@@ -196,11 +183,10 @@
 #undef SYCL_PATAN
 
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSINH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::sinh(a);                                           \
+#define SYCL_PSINH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>(const packet_type& a) { \
+    return cl::sycl::sinh(a);                                                                  \
   }
 
 SYCL_PSINH(cl::sycl::cl_half8)
@@ -209,11 +195,10 @@
 #undef SYCL_PSINH
 
 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOSH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::cosh(a);                                           \
+#define SYCL_PCOSH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>(const packet_type& a) { \
+    return cl::sycl::cosh(a);                                                                  \
   }
 
 SYCL_PCOSH(cl::sycl::cl_half8)
@@ -222,11 +207,10 @@
 #undef SYCL_PCOSH
 
 /** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
-#define SYCL_PTANH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::tanh(a);                                           \
+#define SYCL_PTANH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>(const packet_type& a) { \
+    return cl::sycl::tanh(a);                                                                  \
   }
 
 SYCL_PTANH(cl::sycl::cl_half8)
@@ -234,11 +218,10 @@
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
 
-#define SYCL_PCEIL(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::ceil(a);                                           \
+#define SYCL_PCEIL(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>(const packet_type& a) { \
+    return cl::sycl::ceil(a);                                                                  \
   }
 
 SYCL_PCEIL(cl::sycl::cl_half8)
@@ -246,11 +229,10 @@
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
 
-#define SYCL_PROUND(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::round(a);                                           \
+#define SYCL_PROUND(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>(const packet_type& a) { \
+    return cl::sycl::round(a);                                                                  \
   }
 
 SYCL_PROUND(cl::sycl::cl_half8)
@@ -258,11 +240,10 @@
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
 
-#define SYCL_PRINT(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::rint(a);                                           \
+#define SYCL_PRINT(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>(const packet_type& a) { \
+    return cl::sycl::rint(a);                                                                  \
   }
 
 SYCL_PRINT(cl::sycl::cl_half8)
@@ -270,11 +251,10 @@
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
 
-#define SYCL_FLOOR(packet_type)                                          \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::floor(a);                                           \
+#define SYCL_FLOOR(packet_type)                                                                 \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>(const packet_type& a) { \
+    return cl::sycl::floor(a);                                                                  \
   }
 
 SYCL_FLOOR(cl::sycl::cl_half8)
@@ -282,11 +262,10 @@
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
 
-#define SYCL_PMIN(packet_type, expr)                                   \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>( \
-      const packet_type& a, const packet_type& b) {                    \
-    return expr;                                                       \
+#define SYCL_PMIN(packet_type, expr)                                                                                \
+  template <>                                                                                                       \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { \
+    return expr;                                                                                                    \
   }
 
 SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
@@ -294,11 +273,10 @@
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
 
-#define SYCL_PMAX(packet_type, expr)                                   \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>( \
-      const packet_type& a, const packet_type& b) {                    \
-    return expr;                                                       \
+#define SYCL_PMAX(packet_type, expr)                                                                                \
+  template <>                                                                                                       \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { \
+    return expr;                                                                                                    \
   }
 
 SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
@@ -306,13 +284,10 @@
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
 
-#define SYCL_PLDEXP(packet_type)                                             \
-  template <>                                                                \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(                  \
-      const packet_type& a, const packet_type& exponent) {                   \
-    return cl::sycl::ldexp(                                                  \
-        a, exponent.template convert<cl::sycl::cl_int,                       \
-                                     cl::sycl::rounding_mode::automatic>()); \
+#define SYCL_PLDEXP(packet_type)                                                                                  \
+  template <>                                                                                                     \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(const packet_type& a, const packet_type& exponent) {   \
+    return cl::sycl::ldexp(a, exponent.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>()); \
   }
 
 SYCL_PLDEXP(cl::sycl::cl_half8)
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index 4b0b1c6..6b6bfe4 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -29,15 +29,16 @@
 
 namespace internal {
 #ifdef SYCL_DEVICE_ONLY
-#define SYCL_PLOAD(packet_type, AlignedType)                          \
-  template <>                                                         \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                   \
-      pload##AlignedType<packet_type>(                                \
-          const typename unpacket_traits<packet_type>::type* from) {  \
-   auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(from);\
-    packet_type res{};                                                \
-    res.load(0, ptr);                                     \
-    return res;                                                       \
+#define SYCL_PLOAD(packet_type, AlignedType)                                                                           \
+  template <>                                                                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType<packet_type>(                                   \
+      const typename unpacket_traits<packet_type>::type* from) {                                                       \
+    auto ptr =                                                                                                         \
+        cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+            from);                                                                                                     \
+    packet_type res{};                                                                                                 \
+    res.load(0, ptr);                                                                                                  \
+    return res;                                                                                                        \
   }
 
 SYCL_PLOAD(cl::sycl::cl_float4, u)
@@ -47,37 +48,34 @@
 #undef SYCL_PLOAD
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
-    pload<cl::sycl::cl_half8>(
-        const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
-  auto ptr = cl::sycl::address_space_cast<
-      cl::sycl::access::address_space::generic_space,
-      cl::sycl::access::decorated::no>(
-      reinterpret_cast<const cl::sycl::cl_half*>(from));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<const cl::sycl::cl_half*>(from));
   cl::sycl::cl_half8 res{};
   res.load(0, ptr);
   return res;
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
-ploadu<cl::sycl::cl_half8>(
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 ploadu<cl::sycl::cl_half8>(
     const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
-  auto ptr = cl::sycl::address_space_cast<
-      cl::sycl::access::address_space::generic_space,
-      cl::sycl::access::decorated::no>(
-      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<const cl::sycl::cl_half*>(from));
   cl::sycl::cl_half8 res{};
   res.load(0, ptr);
   return res;
 }
 
-#define SYCL_PSTORE(scalar, packet_type, alignment)             \
-  template <>                                                   \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
-      scalar* to, const packet_type& from) {                    \
-    auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(to);\
-    from.store(0, ptr);                               \
+#define SYCL_PSTORE(scalar, packet_type, alignment)                                                                    \
+  template <>                                                                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(scalar* to, const packet_type& from) {                  \
+    auto ptr =                                                                                                         \
+        cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+            to);                                                                                                       \
+    from.store(0, ptr);                                                                                                \
   }
 
 SYCL_PSTORE(float, cl::sycl::cl_float4, )
@@ -87,22 +85,18 @@
 #undef SYCL_PSTORE
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
-    Eigen::half* to, const cl::sycl::cl_half8& from) {
-  auto ptr = cl::sycl::address_space_cast<
-      cl::sycl::access::address_space::generic_space,
-      cl::sycl::access::decorated::no>(
-      reinterpret_cast<cl::sycl::cl_half*>(to));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<cl::sycl::cl_half*>(to));
   from.store(0, ptr);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
-    Eigen::half* to, const cl::sycl::cl_half8& from) {
-  auto ptr = cl::sycl::address_space_cast<
-      cl::sycl::access::address_space::generic_space,
-      cl::sycl::access::decorated::no>(
-      reinterpret_cast<cl::sycl::cl_half*>(to));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<cl::sycl::cl_half*>(to));
   from.store(0, ptr);
 }
 
@@ -123,44 +117,33 @@
 template <typename packet_type>
 struct get_base_packet {
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
-  get_ploaddup(sycl_multi_pointer) {}
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer) {}
 
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
-  get_pgather(sycl_multi_pointer, Index) {}
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer, Index) {}
 };
 
 template <>
 struct get_base_packet<cl::sycl::cl_half8> {
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
-      sycl_multi_pointer from) {
-    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
-                              static_cast<cl::sycl::half>(from[0]),
-                              static_cast<cl::sycl::half>(from[1]),
-                              static_cast<cl::sycl::half>(from[1]),
-                              static_cast<cl::sycl::half>(from[2]),
-                              static_cast<cl::sycl::half>(from[2]),
-                              static_cast<cl::sycl::half>(from[3]),
-                              static_cast<cl::sycl::half>(from[3]));
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]), static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]), static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]), static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]), static_cast<cl::sycl::half>(from[3]));
   }
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
-      sycl_multi_pointer from, Index stride) {
-    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
-                              static_cast<cl::sycl::half>(from[1 * stride]),
-                              static_cast<cl::sycl::half>(from[2 * stride]),
-                              static_cast<cl::sycl::half>(from[3 * stride]),
-                              static_cast<cl::sycl::half>(from[4 * stride]),
-                              static_cast<cl::sycl::half>(from[5 * stride]),
-                              static_cast<cl::sycl::half>(from[6 * stride]),
-                              static_cast<cl::sycl::half>(from[7 * stride]));
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(
+        static_cast<cl::sycl::half>(from[0 * stride]), static_cast<cl::sycl::half>(from[1 * stride]),
+        static_cast<cl::sycl::half>(from[2 * stride]), static_cast<cl::sycl::half>(from[3 * stride]),
+        static_cast<cl::sycl::half>(from[4 * stride]), static_cast<cl::sycl::half>(from[5 * stride]),
+        static_cast<cl::sycl::half>(from[6 * stride]), static_cast<cl::sycl::half>(from[7 * stride]));
   }
 
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
-      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_half8& from,
+                                                                 Index stride) {
     auto tmp = stride;
     to[0] = Eigen::half(from.s0());
     to[tmp] = Eigen::half(from.s1());
@@ -171,45 +154,36 @@
     to[tmp += stride] = Eigen::half(from.s6());
     to[tmp += stride] = Eigen::half(from.s7());
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
-      const cl::sycl::half& a) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(const cl::sycl::half& a) {
     return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
-                              static_cast<cl::sycl::half>(a + 2),
-                              static_cast<cl::sycl::half>(a + 3),
-                              static_cast<cl::sycl::half>(a + 4),
-                              static_cast<cl::sycl::half>(a + 5),
-                              static_cast<cl::sycl::half>(a + 6),
-                              static_cast<cl::sycl::half>(a + 7));
+                              static_cast<cl::sycl::half>(a + 2), static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4), static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6), static_cast<cl::sycl::half>(a + 7));
   }
 };
 
 template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
-      sycl_multi_pointer from) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
     return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
   }
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(
-      sycl_multi_pointer from, Index stride) {
-    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride],
-                               from[2 * stride], from[3 * stride]);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
   }
 
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
-      sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_float4& from,
+                                                                 Index stride) {
     auto tmp = stride;
     to[0] = from.x();
     to[tmp] = from.y();
     to[tmp += stride] = from.z();
     to[tmp += stride] = from.w();
   }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(
-      const float& a) {
-    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1),
-                               static_cast<float>(a + 2),
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
+    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1), static_cast<float>(a + 2),
                                static_cast<float>(a + 3));
   }
 };
@@ -217,28 +191,25 @@
 template <>
 struct get_base_packet<cl::sycl::cl_double2> {
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2
-  get_ploaddup(const sycl_multi_pointer from) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
     return cl::sycl::cl_double2(from[0], from[0]);
   }
 
   template <typename sycl_multi_pointer, typename Index>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(
-      const sycl_multi_pointer from, Index stride) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from,
+                                                                                Index stride) {
     return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
   }
 
   template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
-      sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to,
+                                                                 const cl::sycl::cl_double2& from, Index stride) {
     to[0] = from.x();
     to[stride] = from.y();
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(
-      const double& a) {
-    return cl::sycl::cl_double2(static_cast<double>(a),
-                                static_cast<double>(a + 1));
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
+    return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
   }
 };
 
@@ -268,15 +239,14 @@
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
     const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
-  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half&)a);
 }
 
-#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
-  pgather<scalar, packet_type>(                                                \
-      const typename unpacket_traits<packet_type>::type* from, Index stride) { \
-    return get_base_packet<packet_type>::get_pgather(from, stride);            \
+#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                               \
+  template <>                                                                     \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pgather<scalar, packet_type>( \
+      const typename unpacket_traits<packet_type>::type* from, Index stride) {    \
+    return get_base_packet<packet_type>::get_pgather(from, stride);               \
   }
 
 SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
@@ -284,12 +254,11 @@
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
 #undef SYCL_PGATHER_SPECILIZE
 
-#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
-  template <>                                                               \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
-      typename unpacket_traits<packet_type>::type * to,                     \
-      const packet_type& from, Index stride) {                              \
-    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
+#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                                             \
+  template <>                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>(                      \
+      typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride) { \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);                                \
   }
 
 SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
@@ -298,11 +267,11 @@
 
 #undef SYCL_PSCATTER_SPECILIZE
 
-#define SYCL_PMAD(packet_type)                                            \
-  template <>                                                             \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(                \
-      const packet_type& a, const packet_type& b, const packet_type& c) { \
-    return cl::sycl::mad(a, b, c);                                        \
+#define SYCL_PMAD(packet_type)                                                                        \
+  template <>                                                                                         \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(const packet_type& a, const packet_type& b, \
+                                                          const packet_type& c) {                     \
+    return cl::sycl::mad(a, b, c);                                                                    \
   }
 
 SYCL_PMAD(cl::sycl::cl_half8)
@@ -311,146 +280,109 @@
 #undef SYCL_PMAD
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
-    const cl::sycl::cl_half8& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
   return Eigen::half(a.s0());
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   return a.x();
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return a.x();
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
-    const cl::sycl::cl_half8& a) {
-  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
-                     + a.s6() + a.s7());
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5() + a.s6() + a.s7());
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   return a.x() + a.y() + a.z() + a.w();
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return a.x() + a.y();
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
-    const cl::sycl::cl_half8& a) {
-  return Eigen::half(cl::sycl::fmax(
-          cl::sycl::fmax(
-            cl::sycl::fmax(a.s0(), a.s1()),
-            cl::sycl::fmax(a.s2(), a.s3())),
-          cl::sycl::fmax(
-            cl::sycl::fmax(a.s4(), a.s5()),
-            cl::sycl::fmax(a.s6(), a.s7()))));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(cl::sycl::fmax(cl::sycl::fmax(a.s0(), a.s1()), cl::sycl::fmax(a.s2(), a.s3())),
+                                    cl::sycl::fmax(cl::sycl::fmax(a.s4(), a.s5()), cl::sycl::fmax(a.s6(), a.s7()))));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
-                        cl::sycl::fmax(a.z(), a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return cl::sycl::fmax(a.x(), a.y());
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
-    const cl::sycl::cl_half8& a) {
-  return Eigen::half(cl::sycl::fmin(
-      cl::sycl::fmin(
-          cl::sycl::fmin(a.s0(), a.s1()),
-          cl::sycl::fmin(a.s2(), a.s3())),
-      cl::sycl::fmin(
-          cl::sycl::fmin(a.s4(), a.s5()),
-          cl::sycl::fmin(a.s6(), a.s7()))));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(cl::sycl::fmin(cl::sycl::fmin(a.s0(), a.s1()), cl::sycl::fmin(a.s2(), a.s3())),
+                                    cl::sycl::fmin(cl::sycl::fmin(a.s4(), a.s5()), cl::sycl::fmin(a.s6(), a.s7()))));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
-                        cl::sycl::fmin(a.z(), a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return cl::sycl::fmin(a.x(), a.y());
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
-    const cl::sycl::cl_half8& a) {
-  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
-                     a.s6() * a.s7());
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() * a.s6() * a.s7());
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   return a.x() * a.y() * a.z() * a.w();
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return a.x() * a.y();
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
-pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
-  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
-                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
-                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()), cl::sycl::fabs(a.s2()),
+                            cl::sycl::fabs(a.s3()), cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
                             cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
-  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
-                             cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()),
+                             cl::sycl::fabs(a.w()));
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
 }
 
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a,
-                                                          const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet& a, const Packet& b) {
   return (a <= b).template as<Packet>();
 }
 
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a,
-                                                          const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet& a, const Packet& b) {
   return (a < b).template as<Packet>();
 }
 
 template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
-                                                          const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet& a, const Packet& b) {
   return (a == b).template as<Packet>();
 }
 
-#define SYCL_PCMP(OP, TYPE)                                                    \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE &a,    \
-                                                             const TYPE &b) {  \
-    return sycl_pcmp_##OP<TYPE>(a, b);                                         \
+#define SYCL_PCMP(OP, TYPE)                                                                  \
+  template <>                                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE& a, const TYPE& b) { \
+    return sycl_pcmp_##OP<TYPE>(a, b);                                                       \
   }
 
 SYCL_PCMP(le, cl::sycl::cl_half8)
@@ -464,8 +396,7 @@
 SYCL_PCMP(eq, cl::sycl::cl_double2)
 #undef SYCL_PCMP
 
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
-    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
   cl::sycl::cl_half tmp = kernel.packet[0].s1();
   kernel.packet[0].s1() = kernel.packet[1].s0();
   kernel.packet[1].s0() = tmp;
@@ -579,8 +510,7 @@
   kernel.packet[7].s6() = tmp;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
-    PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
   kernel.packet[0].y() = kernel.packet[1].x();
   kernel.packet[1].x() = tmp;
@@ -606,8 +536,7 @@
   kernel.packet[3].z() = tmp;
 }
 
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
-    PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
   double tmp = kernel.packet[0].y();
   kernel.packet[0].y() = kernel.packet[1].x();
   kernel.packet[1].x() = tmp;
@@ -615,35 +544,27 @@
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
-    const cl::sycl::cl_half8& thenPacket,
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket, const cl::sycl::cl_half8& thenPacket,
     const cl::sycl::cl_half8& elsePacket) {
-  cl::sycl::cl_short8 condition(
-      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
-      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
-      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
-      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  cl::sycl::cl_short8 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+                                ifPacket.select[3] ? 0 : -1, ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+                                ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
   return cl::sycl::select(thenPacket, elsePacket, condition);
 }
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
-    const cl::sycl::cl_float4& thenPacket,
+    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket, const cl::sycl::cl_float4& thenPacket,
     const cl::sycl::cl_float4& elsePacket) {
-  cl::sycl::cl_int4 condition(
-      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
-      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1);
+  cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+                              ifPacket.select[3] ? 0 : -1);
   return cl::sycl::select(thenPacket, elsePacket, condition);
 }
 
 template <>
-inline cl::sycl::cl_double2 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
-    const cl::sycl::cl_double2& thenPacket,
-    const cl::sycl::cl_double2& elsePacket) {
-  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
-                               ifPacket.select[1] ? 0 : -1);
+inline cl::sycl::cl_double2 pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+                                   const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
+  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1);
   return cl::sycl::select(thenPacket, elsePacket, condition);
 }
 #endif  // SYCL_DEVICE_ONLY
diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
index 9f193c1..6e3fa4f 100644
--- a/Eigen/src/Core/arch/SYCL/TypeCasting.h
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -34,10 +34,9 @@
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4
-pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
-  return a
-      .template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(
+    const cl::sycl::cl_float4& a) {
+  return a.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
 }
 
 template <>
@@ -46,10 +45,9 @@
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
-  return a.template convert<cl::sycl::cl_float,
-                            cl::sycl::rounding_mode::automatic>();
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(
+    const cl::sycl::cl_int4& a) {
+  return a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
 }
 
 template <>
@@ -58,13 +56,10 @@
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
     const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
-  auto a1 = a.template convert<cl::sycl::cl_float,
-                               cl::sycl::rounding_mode::automatic>();
-  auto b1 = b.template convert<cl::sycl::cl_float,
-                               cl::sycl::rounding_mode::automatic>();
+  auto a1 = a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+  auto b1 = b.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
   return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
 }
 
@@ -74,8 +69,8 @@
 };
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(
+    const cl::sycl::cl_float4& a) {
   // Simply discard the second half of the input
   return cl::sycl::cl_double2(a.x(), a.y());
 }
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 4d74d3d..4000e05 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -19,21 +19,22 @@
 namespace internal {
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static Packet4ui  p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+static Packet4ui p4ui_CONJ_XOR = {0x00000000, 0x80000000, 0x00000000,
+                                  0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
 #endif
 
-static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x0000000000000000, 0x8000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+    (Packet2ul)vec_sld((Packet4ui)p2d_ZERO_, (Packet4ui)p2l_ZERO, 8);  //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+    (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_ZERO_, 8);  //{ 0x0000000000000000, 0x8000000000000000 };
 
-struct Packet1cd
-{
+struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
   Packet2d v;
 };
 
-struct Packet2cf
-{
+struct Packet2cf {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
@@ -46,8 +47,8 @@
 #endif
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
   typedef Packet2cf half;
   enum {
@@ -55,23 +56,22 @@
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasBlend  = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasBlend = 1,
     HasSetLinear = 0
   };
 };
 
-
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet1cd type;
   typedef Packet1cd half;
   enum {
@@ -79,58 +79,101 @@
     AlignedOnScalar = 1,
     size = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> {
-  typedef std::complex<float>  type;
-  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet2cf half;
   typedef Packet4f as_real;
 };
-template<> struct unpacket_traits<Packet1cd> {
+template <>
+struct unpacket_traits<Packet1cd> {
   typedef std::complex<double> type;
-  enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
   typedef Packet1cd half;
   typedef Packet2d as_real;
 };
 
 /* Forward declaration */
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel);
 
 /* complex<double> first */
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
 
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride EIGEN_UNUSED) {
   return pload<Packet1cd>(from);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride EIGEN_UNUSED) {
   pstore<std::complex<double> >(to, from);
 }
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   Packet2d a_re, a_im, v1, v2;
 
   // Permute and multiply the real parts of a and b
@@ -141,219 +184,285 @@
   v1 = vec_madd(a_re, b.v, p2d_ZERO);
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(a_im, b.v, p2d_ZERO);
-  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
-  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  v2 = (Packet2d)vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d)vec_xor((Packet2d)v2, (Packet2d)p2ul_CONJ_XOR1);
 
   return Packet1cd(v1 + v2);
 }
-template<> EIGEN_STRONG_INLINE Packet1cd pand    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por     <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
-  Packet2d eq = vec_cmpeq (a.v, b.v);
-  Packet2d tmp = { eq[1], eq[0] };
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_and(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_or(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_xor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_and(a.v, vec_nor(b.v, b.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = vec_cmpeq(a.v, b.v);
+  Packet2d tmp = {eq[1], eq[0]};
   return (Packet1cd)pand<Packet2d>(eq, tmp);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 std::complex<double> res;
   pstore<std::complex<double> >(&res, a);
 
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
   return pfirst(a);
 }
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
   return pfirst(a);
 }
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
   return pdiv_complex(a, b);
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
   return Packet1cd(preverse(Packet2d(x.v)));
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
   Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
 
 /* complex<float> follows */
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
   EIGEN_ALIGN16 std::complex<float> res[2];
   pstore<std::complex<float> >(res, a);
 
   return res[0];
 }
 
-
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   Packet2cf res;
-  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[0] = Packet1cd(vec_ld2f((const float*)&from));
   res.cd[1] = res.cd[0];
   return res;
 }
 #else
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   Packet2cf res;
-  if((std::ptrdiff_t(&from) % 16) == 0)
-    res.v = pload<Packet4f>((const float *)&from);
+  if ((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float*)&from);
   else
-    res.v = ploadu<Packet4f>((const float *)&from);
+    res.v = ploadu<Packet4f>((const float*)&from);
   res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
   return res;
 }
 #endif
 
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
   EIGEN_ALIGN16 std::complex<float> af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
   return pload<Packet2cf>(af);
 }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
   EIGEN_ALIGN16 std::complex<float> af[2];
-  pstore<std::complex<float> >((std::complex<float> *) af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  pstore<std::complex<float> >((std::complex<float>*)af, from);
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate(Packet4f(a.v)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
-  Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
   Packet2cf res;
-  Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };
-  Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };
+  Packet2d tmp1 = {eq.v4f[0][1], eq.v4f[0][0]};
+  Packet2d tmp2 = {eq.v4f[1][1], eq.v4f[1][0]};
   res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
   res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
   Packet2cf res;
   res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
   res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   Packet2cf res;
-  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
-  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  res.v.v4f[0] =
+      pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] =
+      pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
   Packet2cf res;
   res.cd[0] = a.cd[1];
   res.cd[1] = a.cd[0];
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
   std::complex<float> res;
   Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
   vec_st2f(b.v, (float*)&res);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   std::complex<float> res;
   Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
   vec_st2f(b.v, (float*)&res);
   return res;
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   return pdiv_complex(a, b);
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /*<Packet2cf>*/ (const Packet2cf& x) {
   Packet2cf res;
   res.cd[0] = pcplxflip(x.cd[0]);
   res.cd[1] = pcplxflip(x.cd[1]);
   return res;
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   Packet1cd tmp = kernel.packet[0].cd[1];
   kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
   kernel.packet[1].cd[0] = tmp;
 }
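
The ptranspose above treats each Packet2cf as a row of two complex<float> values, so transposing the 2x2 block amounts to swapping the off-diagonal entries, which the cd[0]/cd[1] shuffle does directly. A scalar model of the same operation (a sketch, not Eigen's API):

#include <complex>
#include <utility>

// 2x2 complex transpose: swap the off-diagonal entries in place,
// mirroring the cd[0]/cd[1] swap in the vector code above.
void transpose2x2(std::complex<float> row0[2], std::complex<float> row1[2]) {
  std::swap(row0[1], row1[0]);
}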
 
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
   Packet2cf result;
-  const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+  const Selector<4> ifPacket4 = {ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1]};
   result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
   return result;
 }
 #else
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
-  Packet4f eq = vec_cmpeq (a.v, b.v);
-  Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = vec_cmpeq(a.v, b.v);
+  Packet4f tmp = {eq[1], eq[0], eq[3], eq[2]};
   return (Packet2cf)pand<Packet4f>(eq, tmp);
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   Packet4f a_re, a_im, prod, prod_im;
 
   // Permute and multiply the real parts of a and b
   a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  
+
   // Get the imaginary parts of a
   a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
 
@@ -365,27 +474,27 @@
 
   // multiply a_re * b, add prod_im
   prod = pmadd<Packet4f>(a_re, b.v, prod_im);
- 
+
   return Packet2cf(prod);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
   Packet4f rev_a;
   rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
   return Packet2cf(rev_a);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
   Packet4f b;
   b = vec_sld(a.v, a.v, 8);
   b = padd<Packet4f>(a.v, b);
   return pfirst<Packet2cf>(Packet2cf(b));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   Packet4f b;
   Packet2cf prod;
   b = vec_sld(a.v, a.v, 8);
@@ -394,34 +503,36 @@
   return pfirst<Packet2cf>(prod);
 }
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   return pdiv_complex(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
   return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
 }
 
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
   Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
   Packet2cf result;
-  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  result.v = reinterpret_cast<Packet4f>(
+      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
   return result;
 }
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX32_ZVECTOR_H
+#endif  // EIGEN_COMPLEX32_ZVECTOR_H
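
One semantic detail worth noting in both pcmp_eq variants above (unchanged by the reformat): a complex lane must compare equal in its real and imaginary parts, so the component-wise comparison is ANDed with a copy of itself whose adjacent lanes are swapped (the tmp permutations). A scalar model, assuming nothing beyond std::complex:

#include <complex>

// A complex value is "equal" only if both components are; the vector
// code achieves the && by ANDing eq with its lane-swapped copy.
bool complex_eq_model(std::complex<float> a, std::complex<float> b) {
  bool eq_re = a.real() == b.real();
  bool eq_im = a.imag() == b.imag();
  return eq_re && eq_im;
}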
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 1b43878..5c55350 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -24,7 +24,7 @@
 namespace internal {
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
 static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
 static EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
 static EIGEN_DECLARE_CONST_Packet4i(23, 23);
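
The 0x7f and 23 constants above are the binary32 exponent bias and mantissa width; pexp<Packet4f> below uses them to synthesize 2^n directly in the exponent field (emm0 + p4i_0x7f shifted left by 23). A scalar sketch of that construction:

#include <cstdint>
#include <cstring>

// Build 2^n for binary32 by writing the biased exponent n+127 into
// bits 23..30; valid for n in the normal range, as in pexp below.
float pow2f_sketch(int n) {
  std::uint32_t bits = static_cast<std::uint32_t>(n + 127) << 23;
  float r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}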
@@ -32,27 +32,27 @@
 static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
 
 /* the smallest non denormalized float number */
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-  
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);  // -1.f/0.f
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
+
 /* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
 */
 static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
 static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
 static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
 static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
 static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
 
-static EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
 static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
 
 static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
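
The cephes_* coefficient tables above are consumed through pmadd chains, i.e. Horner's rule, where each pmadd(y, x, c) step computes y*x + c, as in the pexp hunks below. A scalar equivalent for a 9-coefficient polynomial (illustrative only):

// Horner evaluation: p[0]*x^8 + p[1]*x^7 + ... + p[8], one fused
// multiply-add per coefficient, matching the pmadd chains below.
float horner9(const float p[9], float x) {
  float y = p[0];
  for (int i = 1; i < 9; ++i) y = y * x + p[i];
  return y;
}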
@@ -67,11 +67,11 @@
 static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 #endif
 
-static EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-static EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+static EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
 static EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
 
-static EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
 static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
 
 static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
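
exp_hi/exp_lo above are overflow/underflow guards: pexp clamps its argument so the 2^n factor stays representable (exp overflows IEEE double a little above 709.78); the clamp itself falls outside the hunks shown. A scalar model:

#include <algorithm>

// Clamp the exp argument into the representable range, using the
// p2d_exp_hi / p2d_exp_lo bounds declared above.
double clamp_exp_arg(double x) {
  return std::min(std::max(x, -709.436139303), 709.437);
}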
@@ -88,9 +88,8 @@
 static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
 static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
   Packet2d x = _x;
 
   Packet2d tmp, fx;
@@ -108,40 +107,38 @@
   x = psub(x, tmp);
   x = psub(x, z);
 
-  Packet2d x2 = pmul(x,x);
+  Packet2d x2 = pmul(x, x);
 
   Packet2d px = p2d_cephes_exp_p0;
   px = pmadd(px, x2, p2d_cephes_exp_p1);
   px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
+  px = pmul(px, x);
 
   Packet2d qx = p2d_cephes_exp_q0;
   qx = pmadd(qx, x2, p2d_cephes_exp_q1);
   qx = pmadd(qx, x2, p2d_cephes_exp_q2);
   qx = pmadd(qx, x2, p2d_cephes_exp_q3);
 
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(p2d_2, x, p2d_1);
 
   // build 2^n
   emm0 = vec_ctsl(fx, 0);
 
-  static const Packet2l p2l_1023 = { 1023, 1023 };
-  static const Packet2ul p2ul_52 = { 52, 52 };
+  static const Packet2l p2l_1023 = {1023, 1023};
+  static const Packet2ul p2ul_52 = {52, 52};
 
   emm0 = emm0 + p2l_1023;
   emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
 
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
   // inputs and return them unmodified.
   Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
-                 isnumber_mask);
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), isnumber_mask);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
   Packet4f x = _x;
 
@@ -161,7 +158,7 @@
   x = psub(x, tmp);
   x = psub(x, z);
 
-  z = pmul(x,x);
+  z = pmul(x, x);
 
   Packet4f y = p4f_cephes_exp_p0;
   y = pmadd(y, x, p4f_cephes_exp_p1);
@@ -173,7 +170,7 @@
   y = padd(y, p4f_1);
 
   // build 2^n
-  emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
+  emm0 = (Packet4i){(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
   emm0 = emm0 + p4i_0x7f;
   emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
 
@@ -186,15 +183,13 @@
 #endif
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
   return vec_sqrt(x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
   Packet4f res;
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
   res = vec_sqrt(x);
@@ -205,13 +200,13 @@
   return res;
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
   return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   Packet4f res;
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
   res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
@@ -224,8 +219,7 @@
 
 // Hyperbolic Tangent function.
 template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-ptanh<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x) {
   return internal::generic_fast_tanh_float(x);
 }
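
Pulling the pexp<Packet2d> pieces above together: the algorithm is the classic Cephes form e^x ~= 2^n * (1 + 2*P(x)/(Q(x) - P(x))) after reducing x by n*ln2, with ln2 split into C1 + C2 for accuracy. A scalar sketch, with the coefficient arrays left abstract (the p2d_cephes_exp_p*/q* constants hold the actual values, and the fx = round(x*log2(e)) step falls outside the hunks shown):

#include <cmath>
#include <cstdint>
#include <cstring>

// Cephes-style exp: range-reduce, evaluate the Pade approximant,
// then scale by 2^n built in the exponent field (n + 1023 << 52).
double cephes_exp_sketch(double x, const double p[3], const double q[4]) {
  double fx = std::round(x * 1.4426950408889634);  // n = round(x*log2(e))
  x -= fx * 0.693145751953125;                     // cephes_exp_C1
  x -= fx * 1.42860682030941723212e-6;             // cephes_exp_C2
  double x2 = x * x;
  double px = ((p[0] * x2 + p[1]) * x2 + p[2]) * x;
  double qx = ((q[0] * x2 + q[1]) * x2 + q[2]) * x2 + q[3];
  double r = 2.0 * px / (qx - px) + 1.0;  // pmadd(p2d_2, x, p2d_1)
  std::uint64_t bits =
      static_cast<std::uint64_t>(static_cast<long long>(fx) + 1023) << 52;
  double two_n;
  std::memcpy(&two_n, &bits, sizeof two_n);
  return r * two_n;  // NaN passthrough via isnumber_mask omitted here
}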
 
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 07de778..8ac8f77 100644
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -26,135 +26,136 @@
 #endif
 
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-typedef __vector int                 Packet4i;
-typedef __vector unsigned int        Packet4ui;
-typedef __vector __bool int          Packet4bi;
-typedef __vector short int           Packet8i;
-typedef __vector unsigned char       Packet16uc;
-typedef __vector double              Packet2d;
-typedef __vector unsigned long long  Packet2ul;
-typedef __vector long long           Packet2l;
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8i;
+typedef __vector unsigned char Packet16uc;
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
 
 // Z14 has builtin support for float vectors
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-typedef __vector float               Packet4f;
+typedef __vector float Packet4f;
 #else
 typedef struct {
-	Packet2d  v4f[2];
+  Packet2d v4f[2];
 } Packet4f;
 #endif
 
 typedef union {
-  numext::int32_t   i[4];
+  numext::int32_t i[4];
   numext::uint32_t ui[4];
-  numext::int64_t   l[2];
+  numext::int64_t l[2];
   numext::uint64_t ul[2];
-  double    d[2];
-  float     f[4];
-  Packet4i  v4i;
+  double d[2];
+  float f[4];
+  Packet4i v4i;
   Packet4ui v4ui;
-  Packet2l  v2l;
+  Packet2l v2l;
   Packet2ul v2ul;
-  Packet2d  v2d;
+  Packet2d v2d;
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-  Packet4f  v4f;
+  Packet4f v4f;
 #endif
 } Packet;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
 
-#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
 
-#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
-  Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
 
-#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
-  Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
 
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
-  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
-#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
-  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
 
 // These constants are endian-agnostic
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);  //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);   //{ 1, 1, 1, 1}
 
 static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
 static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
 static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
 
-static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
-                              numext::bit_cast<double>(0x8000000000000000ull) };
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
+                             numext::bit_cast<double>(0x8000000000000000ull)};
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
 
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
   const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
 
-static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);     //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);  //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
 #endif
 
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
+    vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
 
-static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
 
 // Mask alignment
-#define EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
 
-#define EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
+#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
 
 // Handle endianness properly while loading constants
 // Define global static constants:
 
-static Packet16uc p16uc_FORWARD =   { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
 
-static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
+8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 
-static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
-static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
-static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
+(Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
+16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
+8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
 
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV =
+    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
-static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-
+static Packet16uc p16uc_COMPLEX32_REV2 =
+    vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);  //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
-  #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #else
-  #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( "   pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm("   pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
 #endif
 
-template<> struct packet_traits<int>    : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
@@ -162,10 +163,10 @@
     AlignedOnScalar = 1,
     size = 4,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasBlend = 1
   };
 };
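
packet_traits<int> above advertises which operations have a ZVector fast path; generic Eigen code branches on these enums at compile time before choosing a packet kernel. Roughly (an illustrative query, not Eigen's internals; assumes <Eigen/Core> is included):

// Query sketch: the Has* enums are compile-time constants, so
// vectorized-vs-scalar dispatch costs nothing at run time.
template <typename T>
constexpr bool has_vectorized_div() {
  return Eigen::internal::packet_traits<T>::HasDiv != 0;
}
// has_vectorized_div<int>() is true here, per HasDiv = 1 above.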
@@ -202,26 +203,26 @@
   };
 };
 
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=2,
+    size = 2,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 1,
-    HasMin  = 1,
-    HasMax  = 1,
-    HasAbs  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
     HasRound = 1,
@@ -232,47 +233,75 @@
   };
 };
 
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4i half;
+};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4f half;
+};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2d half;
+};
 
 /* Forward declaration */
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
- 
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
+
+inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
   Packet vt;
   vt.v4i = v;
   s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
   Packet vt;
   vt.v4ui = v;
   s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
   Packet vt;
   vt.v2l = v;
   s << vt.l[0] << ", " << vt.l[1];
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
   Packet vt;
   vt.v2ul = v;
-  s << vt.ul[0] << ", " << vt.ul[1] ;
+  s << vt.ul[0] << ", " << vt.ul[1];
   return s;
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
   Packet vt;
   vt.v2d = v;
   s << vt.d[0] << ", " << vt.d[1];
@@ -280,8 +309,7 @@
 }
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
   Packet vt;
   vt.v4f = v;
   s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
@@ -289,54 +317,53 @@
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
+  Packet* vfrom;
+  vfrom = (Packet*)from;
   return vfrom->v4i;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
+  Packet* vfrom;
+  vfrom = (Packet*)from;
   return vfrom->v2d;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
+  Packet* vto;
+  vto = (Packet*)to;
   vto->v4i = from;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
+  Packet* vto;
+  vto = (Packet*)to;
   vto->v2d = from;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
   return vec_splats(from);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
   return vec_splats(from);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
-                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
   a3 = pload<Packet4i>(a);
   a0 = vec_splat(a3, 0);
   a1 = vec_splat(a3, 1);
@@ -344,187 +371,316 @@
   a3 = vec_splat(a3, 3);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
   a1 = pload<Packet2d>(a);
   a0 = vec_splat(a1, 0);
   a1 = vec_splat(a1, 1);
-  a3 = pload<Packet2d>(a+2);
+  a3 = pload<Packet2d>(a + 2);
   a2 = vec_splat(a3, 0);
   a3 = vec_splat(a3, 1);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
   EIGEN_ALIGN16 int ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+  ai[0] = from[0 * stride];
+  ai[1] = from[1 * stride];
+  ai[2] = from[2 * stride];
+  ai[3] = from[3 * stride];
+  return pload<Packet4i>(ai);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
   EIGEN_ALIGN16 double af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
- return pload<Packet2d>(af);
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
+  return pload<Packet2d>(af);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
   EIGEN_ALIGN16 int ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  pstore<int>((int*)ai, from);
+  to[0 * stride] = ai[0];
+  to[1 * stride] = ai[1];
+  to[2 * stride] = ai[2];
+  to[3 * stride] = ai[3];
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
   EIGEN_ALIGN16 double af[2];
   pstore<double>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a + b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a - b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a * b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a / b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  return (-a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return padd<Packet4i>(pmul<Packet4i>(a, b), c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_madd(a, b, c);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_min(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_max(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_or(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_xor(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return pand<Packet4i>(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, vec_nor(b, b));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  return vec_round(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vec_floor(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
+  return pload<Packet4i>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  return pload<Packet2d>(from);
+}
 
-
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
   Packet4i p = pload<Packet4i>(from);
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
   Packet2d p = pload<Packet2d>(from);
   return vec_perm(p, p, p16uc_PSET64_HI);
 }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int    x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
-  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+  pstore<int>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  pstore<double>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{
-  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int x[4];
+  pstore(x, a);
+  return x[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_ALIGN16 double x[2];
+  pstore(x, a);
+  return x[0];
+}
 
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return reinterpret_cast<Packet4i>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return reinterpret_cast<Packet2d>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
+  return vec_abs(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
   Packet4i b, sum;
-  b   = vec_sld(a, a, 8);
+  b = vec_sld(a, a, 8);
   sum = padd<Packet4i>(a, b);
-  b   = vec_sld(sum, sum, 4);
+  b = vec_sld(sum, sum, 4);
   sum = padd<Packet4i>(sum, b);
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
   Packet2d b, sum;
-  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
   sum = padd<Packet2d>(a, b);
   return pfirst(sum);
 }
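
The predux implementations above reduce by rotate-and-add: vec_sld(a, a, 8) rotates the register by half its width, so each padd halves the number of live lanes. The scalar shape of the 4-lane sum (sketch):

// Rotate-by-8-bytes then add gives {v0+v2, v1+v3, ...}; rotating the
// partial sums by 4 bytes and adding leaves the total in lane 0.
int predux4_sketch(const int v[4]) {
  int s02 = v[0] + v[2], s13 = v[1] + v[3];  // after the 8-byte step
  return s02 + s13;                          // after the 4-byte step
}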
 
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
   return aux[0] * aux[1] * aux[2] * aux[3];
 }
 
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
 // min
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
   Packet4i b, res;
-  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  b = pmin<Packet4i>(a, vec_sld(a, a, 8));
   res = pmin<Packet4i>(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(pmin<Packet2d>(
+      a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
 // max
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
   Packet4i b, res;
   b = pmax<Packet4i>(a, vec_sld(a, a, 8));
   res = pmax<Packet4i>(b, vec_sld(b, b, 4));
@@ -532,13 +688,13 @@
 }
 
 // max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(pmax<Packet2d>(
+      a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
   Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -549,23 +705,25 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
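
The 4x4 integer transpose above is the standard two-stage merge network: mergeh/mergel interleave rows 0/2 and 1/3, then interleaving those partials yields the transposed rows. Functionally it matches the plain in-place transpose (reference sketch):

// Reference semantics for ptranspose(PacketBlock<Packet4i,4>): swap
// element (r,c) with (c,r) across the 4x4 block.
void transpose4x4_sketch(int m[4][4]) {
  for (int r = 0; r < 4; ++r)
    for (int c = r + 1; c < 4; ++c) {
      int t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
}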
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
   Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
   kernel.packet[0] = t0;
   kernel.packet[1] = t1;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
   Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
-  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
   Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
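
Both pblend overloads above follow the same recipe: widen the Selector entries, compare against the all-ones splat to get a full-width lane mask, and let vec_sel pick lanes. Per-lane semantics (sketch):

// select[i] == 1 picks thenv[i], anything else picks elsev[i],
// mirroring vec_cmpeq(select, ones) feeding vec_sel above.
void blend2_sketch(const unsigned long long select[2], const double thenv[2],
                   const double elsev[2], double out[2]) {
  for (int i = 0; i < 2; ++i)
    out[i] = (select[i] == 1ull) ? thenv[i] : elsev[i];
}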
@@ -576,32 +734,32 @@
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
 /* Helper function to simulate a vec_splat_packet4f
  */
-template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
-{
+template <int element>
+EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
   Packet4f splat;
   switch (element) {
-  case 0:
-    splat.v4f[0] = vec_splat(from.v4f[0], 0);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 1:
-    splat.v4f[0] = vec_splat(from.v4f[0], 1);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 2:
-    splat.v4f[0] = vec_splat(from.v4f[1], 0);
-    splat.v4f[1] = splat.v4f[0];
-    break;
-  case 3:
-    splat.v4f[0] = vec_splat(from.v4f[1], 1);
-    splat.v4f[1] = splat.v4f[0];
-    break;
+    case 0:
+      splat.v4f[0] = vec_splat(from.v4f[0], 0);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 1:
+      splat.v4f[0] = vec_splat(from.v4f[0], 1);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 2:
+      splat.v4f[0] = vec_splat(from.v4f[1], 0);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 3:
+      splat.v4f[0] = vec_splat(from.v4f[1], 1);
+      splat.v4f[1] = splat.v4f[0];
+      break;
   }
   return splat;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
   Packet4f vfrom;
@@ -610,26 +768,24 @@
   return vfrom;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
   vec_st2f(from.v4f[0], &to[0]);
   vec_st2f(from.v4f[1], &to[2]);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
   Packet4f to;
   to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
   to.v4f[1] = to.v4f[0];
   return to;
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
   a3 = pload<Packet4f>(a);
   a0 = vec_splat_packet4f<0>(a3);
   a1 = vec_splat_packet4f<1>(a3);
@@ -637,207 +793,213 @@
   a3 = vec_splat_packet4f<3>(a3);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
   EIGEN_ALIGN16 float ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4f>(ai);
+  ai[0] = from[0 * stride];
+  ai[1] = from[1 * stride];
+  ai[2] = from[2 * stride];
+  ai[3] = from[3 * stride];
+  return pload<Packet4f>(ai);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
   EIGEN_ALIGN16 float ai[4];
-  pstore<float>((float *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
+  pstore<float>((float*)ai, from);
+  to[0 * stride] = ai[0];
+  to[1 * stride] = ai[1];
+  to[2 * stride] = ai[2];
+  to[3 * stride] = ai[3];
 }
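
// The gather/scatter pair above has no strided vector load/store to lean on,
// so both round-trip through an aligned scratch array: four scalar strided
// accesses plus one aligned packet access. Generic sketch (illustrative
// names, not Eigen API):
template <typename T>
static inline void gather4_model(const T* from, long stride, T out[4]) {
  alignas(16) T buf[4];
  for (int i = 0; i < 4; ++i) buf[i] = from[i * stride];  // strided reads
  for (int i = 0; i < 4; ++i) out[i] = buf[i];            // stands in for pload
}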
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f c;
   c.v4f[0] = a.v4f[0] + b.v4f[0];
   c.v4f[1] = a.v4f[1] + b.v4f[1];
   return c;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f c;
   c.v4f[0] = a.v4f[0] - b.v4f[0];
   c.v4f[1] = a.v4f[1] - b.v4f[1];
   return c;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f c;
   c.v4f[0] = a.v4f[0] * b.v4f[0];
   c.v4f[1] = a.v4f[1] * b.v4f[1];
   return c;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f c;
   c.v4f[0] = a.v4f[0] / b.v4f[0];
   c.v4f[1] = a.v4f[1] / b.v4f[1];
   return c;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
   Packet4f c;
   c.v4f[0] = -a.v4f[0];
   c.v4f[1] = -a.v4f[1];
   return c;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
   Packet4f res;
   res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
   res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = por(a.v4f[0], b.v4f[0]);
   res.v4f[1] = por(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
   Packet4f res;
   res.v4f[0] = vec_round(a.v4f[0]);
   res.v4f[1] = vec_round(a.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
   Packet4f res;
   res.v4f[0] = vec_ceil(a.v4f[0]);
   res.v4f[1] = vec_ceil(a.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
   Packet4f res;
   res.v4f[0] = vec_floor(a.v4f[0]);
   res.v4f[1] = vec_floor(a.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
   Packet4f p = pload<Packet4f>(from);
   p.v4f[1] = vec_splat(p.v4f[0], 1);
   p.v4f[0] = vec_splat(p.v4f[0], 0);
   return p;
 }
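
// For reference, the duplication above yields {from[0], from[0], from[1],
// from[1]}: half 0 becomes a splat of lane 0, half 1 a splat of lane 1.
// Scalar sketch (illustrative only):
static inline void ploaddup_model(const float* from, float out[4]) {
  out[0] = out[1] = from[0];  // half 0: vec_splat(p.v4f[0], 0)
  out[2] = out[3] = from[1];  // half 1: vec_splat(p.v4f[0], 1)
}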
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x[2];
+  vec_st2f(a.v4f[0], &x[0]);
+  return x[0];
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
   Packet4f rev;
   rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
   rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
   return rev;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
   Packet4f res;
   res.v4f[0] = pabs(a.v4f[0]);
   res.v4f[1] = pabs(a.v4f[1]);
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
   Packet2d sum;
   sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
   double first = predux<Packet2d>(sum);
   return static_cast<float>(first);
 }
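
// Scalar model of the emulated reduction above (illustrative only): the
// lane-wise padd of the two halves gives {a0 + a2, a1 + a3}, the Packet2d
// reduction folds those into a0 + a1 + a2 + a3, and only the final result is
// narrowed back to float.
static inline float predux4_model(const double h0[2], const double h1[2]) {
  const double s0 = h0[0] + h1[0];          // lane 0 of padd<Packet2d>
  const double s1 = h0[1] + h1[1];          // lane 1
  return static_cast<float>(s0 + s1);       // predux<Packet2d>, then the cast
}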
 
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
   // Return predux_mul<Packet2d> of the subvectors' product
   return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
   Packet2d b, res;
-  b   = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
-  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(
+      b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
   return static_cast<float>(pfirst(res));
 }
 
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
   Packet2d b, res;
-  b   = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
-  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(
+      b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
   return static_cast<float>(pfirst(res));
 }
 
 /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
  */
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  PacketBlock<Packet2d, 2> t0, t1, t2, t3;
   // copy top-left 2x2 Packet2d block
   t0.packet[0] = kernel.packet[0].v4f[0];
   t0.packet[1] = kernel.packet[1].v4f[0];
@@ -871,9 +1033,11 @@
   kernel.packet[3].v4f[1] = t3.packet[1];
 }
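
// Worked picture of the block decomposition above (illustrative only): in
// the two-Packet2d representation the 4x4 float kernel is a 2x2 grid of 2x2
// blocks t0..t3, so the full transpose is "transpose each block, then swap
// the off-diagonal blocks":
//   [ t0 t1 ]^T   [ t0^T  t2^T ]
//   [ t2 t3 ]   = [ t1^T  t3^T ]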
 
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
+  Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
   Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
   Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
   Packet4f result;
@@ -882,24 +1046,24 @@
   return result;
 }
 
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
   return res;
 }
 
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
   Packet4f res;
   res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
   res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
@@ -907,33 +1071,31 @@
 }
 
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
+  Packet* vfrom;
+  vfrom = (Packet*)from;
   return vfrom->v4f;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
   // FIXME: No intrinsic yet
   EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
+  Packet* vto;
+  vto = (Packet*)to;
   vto->v4f = from;
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
   return vec_splats(from);
 }
 
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
   a3 = pload<Packet4f>(a);
   a0 = vec_splat(a3, 0);
   a1 = vec_splat(a3, 1);
@@ -941,95 +1103,151 @@
   a3 = vec_splat(a3, 3);
 }
 
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
   EIGEN_ALIGN16 float af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
+  af[2] = from[2 * stride];
+  af[3] = from[3 * stride];
+  return pload<Packet4f>(af);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
   EIGEN_ALIGN16 float af[4];
   pstore<float>((float*)af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
+  to[2 * stride] = af[2];
+  to[3 * stride] = af[3];
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>  (const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
+  return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  return vec_round(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x[4];
+  pstore(x, a);
+  return x[0];
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
   Packet4f p = pload<Packet4f>(from);
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
-  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return reinterpret_cast<Packet4f>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
 }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
   Packet4f b, sum;
-  b   = vec_sld(a, a, 8);
+  b = vec_sld(a, a, 8);
   sum = padd<Packet4f>(a, b);
-  b   = vec_sld(sum, sum, 4);
+  b = vec_sld(sum, sum, 4);
   sum = padd<Packet4f>(sum, b);
   return pfirst(sum);
 }
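
// Worked example of the two vec_sld steps above, for a = {a0, a1, a2, a3}
// (illustrative only):
//   b = vec_sld(a, a, 8)     = {a2, a3, a0, a1}
//   sum = a + b              = {a0+a2, a1+a3, a2+a0, a3+a1}
//   b = vec_sld(sum, sum, 4) rotates one lane, so the second padd leaves
//   a0+a1+a2+a3 in lane 0, where pfirst() reads it.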
 
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
   Packet4f prod;
   prod = pmul(a, vec_sld(a, a, 8));
   return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
 }
 
 // min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
   Packet4f b, res;
-  b   = pmin<Packet4f>(a, vec_sld(a, a, 8));
+  b = pmin<Packet4f>(a, vec_sld(a, a, 8));
   res = pmin<Packet4f>(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
 // max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
   Packet4f b, res;
   b = pmax<Packet4f>(a, vec_sld(a, a, 8));
   res = pmax<Packet4f>(b, vec_sld(b, b, 4));
   return pfirst(res);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
   Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
   Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -1040,21 +1258,35 @@
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
   Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
 #endif
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>  (const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  return pload<Packet4f>(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  pstore<float>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
+}
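
// Assuming p4f_COUNTDOWN is the {0, 1, 2, 3} ramp (as its use here suggests),
// plset returns the linear sequence starting at a, e.g.
//   plset<Packet4f>(10.f) == {10.f, 11.f, 12.f, 13.f}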
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_ZVECTOR_H
+#endif  // EIGEN_PACKET_MATH_ZVECTOR_H