// Eigen/src/Core/arch/clang/Reductions.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2025 Rasmus Munk Larsen
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0

 #ifndef EIGEN_REDUCTIONS_CLANG_H
 #define EIGEN_REDUCTIONS_CLANG_H

 // IWYU pragma: private
 #include "../../InternalHeaderCheck.h"

 namespace Eigen {
 namespace internal {

 // --- Reductions ---
 #if EIGEN_HAS_BUILTIN(__builtin_reduce_min) && EIGEN_HAS_BUILTIN(__builtin_reduce_max) && \
     EIGEN_HAS_BUILTIN(__builtin_reduce_or)
 // Stamps out the predux_min, predux_max and predux_any specializations for PACKET_TYPE.
 //  - predux_min / predux_max forward the packet to __builtin_reduce_min / __builtin_reduce_max.
 //  - predux_any compares every lane with zero and ORs the per-lane results, so it is true
 //    as soon as one lane compares unequal to zero.
 #define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE) \
   template <> \
   EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_min(const PACKET_TYPE& a) { \
     const auto smallest_lane = __builtin_reduce_min(a); \
     return smallest_lane; \
   } \
   template <> \
   EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_max(const PACKET_TYPE& a) { \
     const auto largest_lane = __builtin_reduce_max(a); \
     return largest_lane; \
   } \
   template <> \
   EIGEN_STRONG_INLINE bool predux_any(const PACKET_TYPE& a) { \
     const auto nonzero_lanes = (a != 0); \
     return __builtin_reduce_or(nonzero_lanes) != 0; \
   }

 // Instantiate for the float, double and both integer packet types.
 EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf)
 EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd)
 EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi)
 EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl)
 // The generator macro is local to this header.
 #undef EIGEN_CLANG_PACKET_REDUX_MINMAX
 #endif

 #if EIGEN_HAS_BUILTIN(__builtin_reduce_add) && EIGEN_HAS_BUILTIN(__builtin_reduce_mul)
 // Stamps out predux (sum of all lanes) and predux_mul (product of all lanes) for
 // PACKET_TYPE by forwarding to __builtin_reduce_add / __builtin_reduce_mul.
 #define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE) \
   template <> \
   EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux<PACKET_TYPE>(const PACKET_TYPE& a) { \
     const auto lane_sum = __builtin_reduce_add(a); \
     return lane_sum; \
   } \
   template <> \
   EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_mul<PACKET_TYPE>(const PACKET_TYPE& a) { \
     const auto lane_product = __builtin_reduce_mul(a); \
     return lane_product; \
   }

 // Integer packets only: __builtin_reduce_{mul,add} are only defined for integer types.
 EIGEN_CLANG_PACKET_REDUX_INT(PacketXi)
 EIGEN_CLANG_PACKET_REDUX_INT(PacketXl)
 // The generator macro is local to this header.
 #undef EIGEN_CLANG_PACKET_REDUX_INT
 #endif

 #if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
 namespace detail {

 // Lane-folding helpers shared by the predux / predux_mul specializations.
 //
 // ReduceAddN / ReduceMulN read lanes 0..N-1 of `a`. They repeatedly combine the lower
 // half of the remaining lanes with the upper half (lane-wise + or *) until two lanes
 // are left, and return those two lanes as a pair:
 //   first  = combination of the even-indexed input lanes,
 //   second = combination of the odd-indexed input lanes.
 // The two-lane variants have nothing to fold and return lanes 0 and 1 as they are.

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd2(
     const VectorT& a) {
   const auto lane0 = a[0];
   const auto lane1 = a[1];
   return {lane0, lane1};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd4(
     const VectorT& a) {
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(a, a, 0, 1);
   const auto hi2 = __builtin_shufflevector(a, a, 2, 3);
   const auto sum2 = lo2 + hi2;
   return {sum2[0], sum2[1]};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
     const VectorT& a) {
   // 8 -> 4 lanes.
   const auto lo4 = __builtin_shufflevector(a, a, 0, 1, 2, 3);
   const auto hi4 = __builtin_shufflevector(a, a, 4, 5, 6, 7);
   const auto sum4 = lo4 + hi4;
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(sum4, sum4, 0, 1);
   const auto hi2 = __builtin_shufflevector(sum4, sum4, 2, 3);
   const auto sum2 = lo2 + hi2;
   return {sum2[0], sum2[1]};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
     const VectorT& a) {
   // 16 -> 8 lanes.
   const auto lo8 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
   const auto hi8 = __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
   const auto sum8 = lo8 + hi8;
   // 8 -> 4 lanes.
   const auto lo4 = __builtin_shufflevector(sum8, sum8, 0, 1, 2, 3);
   const auto hi4 = __builtin_shufflevector(sum8, sum8, 4, 5, 6, 7);
   const auto sum4 = lo4 + hi4;
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(sum4, sum4, 0, 1);
   const auto hi2 = __builtin_shufflevector(sum4, sum4, 2, 3);
   const auto sum2 = lo2 + hi2;
   return {sum2[0], sum2[1]};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul2(
     const VectorT& a) {
   const auto lane0 = a[0];
   const auto lane1 = a[1];
   return {lane0, lane1};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul4(
     const VectorT& a) {
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(a, a, 0, 1);
   const auto hi2 = __builtin_shufflevector(a, a, 2, 3);
   const auto prod2 = lo2 * hi2;
   return {prod2[0], prod2[1]};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
     const VectorT& a) {
   // 8 -> 4 lanes.
   const auto lo4 = __builtin_shufflevector(a, a, 0, 1, 2, 3);
   const auto hi4 = __builtin_shufflevector(a, a, 4, 5, 6, 7);
   const auto prod4 = lo4 * hi4;
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(prod4, prod4, 0, 1);
   const auto hi2 = __builtin_shufflevector(prod4, prod4, 2, 3);
   const auto prod2 = lo2 * hi2;
   return {prod2[0], prod2[1]};
 }

 template <typename VectorT>
 EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul16(
     const VectorT& a) {
   // 16 -> 8 lanes.
   const auto lo8 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
   const auto hi8 = __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
   const auto prod8 = lo8 * hi8;
   // 8 -> 4 lanes.
   const auto lo4 = __builtin_shufflevector(prod8, prod8, 0, 1, 2, 3);
   const auto hi4 = __builtin_shufflevector(prod8, prod8, 4, 5, 6, 7);
   const auto prod4 = lo4 * hi4;
   // 4 -> 2 lanes.
   const auto lo2 = __builtin_shufflevector(prod4, prod4, 0, 1);
   const auto hi2 = __builtin_shufflevector(prod4, prod4, 2, 3);
   const auto prod2 = lo2 * hi2;
   return {prod2[0], prod2[1]};
 }
 }  // namespace detail

 // --- predux and predux_mul for float ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Adds up lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
 // detail::ReduceAdd4 yields the even-lane and odd-lane partial sums, which are then added.
 template <>
 EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
   const auto partial_sums = detail::ReduceAdd4(a);
   const float even_sum = partial_sums.first;
   const float odd_sum = partial_sums.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
 // detail::ReduceMul4 yields the even-lane and odd-lane partial products, which are then multiplied.
 template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
   const auto partial_products = detail::ReduceMul4(a);
   const float even_product = partial_products.first;
   const float odd_product = partial_products.second;
   return even_product * odd_product;
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Adds up lanes 0..7 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
 // detail::ReduceAdd8 yields the even-lane and odd-lane partial sums, which are then added.
 template <>
 EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
   const auto partial_sums = detail::ReduceAdd8(a);
   const float even_sum = partial_sums.first;
   const float odd_sum = partial_sums.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0..7 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
 // detail::ReduceMul8 yields the even-lane and odd-lane partial products, which are then multiplied.
 template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
   const auto partial_products = detail::ReduceMul8(a);
   const float even_product = partial_products.first;
   const float odd_product = partial_products.second;
   return even_product * odd_product;
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Adds up lanes 0..15 of `a` (fallback variant, annotated in this file as the 64-byte case):
 // detail::ReduceAdd16 yields the even-lane and odd-lane partial sums, which are then added.
 template <>
 EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
   const auto partial_sums = detail::ReduceAdd16(a);
   const float even_sum = partial_sums.first;
   const float odd_sum = partial_sums.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0..15 of `a` (fallback variant, annotated in this file as the 64-byte case):
 // detail::ReduceMul16 yields the even-lane and odd-lane partial products, which are then multiplied.
 template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
   const auto partial_products = detail::ReduceMul16(a);
   const float even_product = partial_products.first;
   const float odd_product = partial_products.second;
   return even_product * odd_product;
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 // --- predux and predux_mul for double ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Adds lanes 0 and 1 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
 // detail::ReduceAdd2 just hands back the two lanes.
 template <>
 EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
   const auto lanes = detail::ReduceAdd2(a);
   const double even_sum = lanes.first;
   const double odd_sum = lanes.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0 and 1 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
 // detail::ReduceMul2 just hands back the two lanes.
 template <>
 EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
   const auto lanes = detail::ReduceMul2(a);
   const double even_product = lanes.first;
   const double odd_product = lanes.second;
   return even_product * odd_product;
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Adds up lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
 // detail::ReduceAdd4 yields the even-lane and odd-lane partial sums, which are then added.
 template <>
 EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
   const auto partial_sums = detail::ReduceAdd4(a);
   const double even_sum = partial_sums.first;
   const double odd_sum = partial_sums.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
 // detail::ReduceMul4 yields the even-lane and odd-lane partial products, which are then multiplied.
 template <>
 EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
   const auto partial_products = detail::ReduceMul4(a);
   const double even_product = partial_products.first;
   const double odd_product = partial_products.second;
   return even_product * odd_product;
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Adds up lanes 0..7 of `a` (fallback variant, annotated in this file as the 64-byte case):
 // detail::ReduceAdd8 yields the even-lane and odd-lane partial sums, which are then added.
 template <>
 EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
   const auto partial_sums = detail::ReduceAdd8(a);
   const double even_sum = partial_sums.first;
   const double odd_sum = partial_sums.second;
   return even_sum + odd_sum;
 }
 // Multiplies lanes 0..7 of `a` (fallback variant, annotated in this file as the 64-byte case):
 // detail::ReduceMul8 yields the even-lane and odd-lane partial products, which are then multiplied.
 template <>
 EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
   const auto partial_products = detail::ReduceMul8(a);
   const double even_product = partial_products.first;
   const double odd_product = partial_products.second;
   return even_product * odd_product;
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 // --- predux for complex<float> ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Complex sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: folds lanes 0..3 of a.v with
 // detail::ReduceAdd4; the even-lane sum becomes the real part and the odd-lane sum the
 // imaginary part.
 // NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
   const auto lane_sums = detail::ReduceAdd4(a.v);
   const float re = lane_sums.first;
   const float im = lane_sums.second;
   return std::complex<float>(re, im);
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Complex sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: folds lanes 0..7 of a.v with
 // detail::ReduceAdd8; the even-lane sum becomes the real part and the odd-lane sum the
 // imaginary part.
 // NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
   const auto lane_sums = detail::ReduceAdd8(a.v);
   const float re = lane_sums.first;
   const float im = lane_sums.second;
   return std::complex<float>(re, im);
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Complex sum for the fallback branch (annotated in this file as the 64-byte case): folds
 // lanes 0..15 of a.v with detail::ReduceAdd16; the even-lane sum becomes the real part and
 // the odd-lane sum the imaginary part.
 // NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
   const auto lane_sums = detail::ReduceAdd16(a.v);
   const float re = lane_sums.first;
   const float im = lane_sums.second;
   return std::complex<float>(re, im);
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 // --- predux for complex<double> ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Complex-double sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated as
 // holding a single complex value, so element 0 is the result.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
   const std::complex<double> only_element = a[0];
   return only_element;
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Complex-double sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: folds lanes 0..3 of a.v
 // with detail::ReduceAdd4; the even-lane sum becomes the real part and the odd-lane sum
 // the imaginary part.
 // NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcd.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
   const auto lane_sums = detail::ReduceAdd4(a.v);
   const double re = lane_sums.first;
   const double im = lane_sums.second;
   return std::complex<double>(re, im);
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Complex-double sum for the fallback branch (annotated in this file as the 64-byte case):
 // folds lanes 0..7 of a.v with detail::ReduceAdd8; the even-lane sum becomes the real part
 // and the odd-lane sum the imaginary part.
 // NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcd.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
   const auto lane_sums = detail::ReduceAdd8(a.v);
   const double re = lane_sums.first;
   const double im = lane_sums.second;
   return std::complex<double>(re, im);
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 // --- predux_mul for complex<float> ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Complex-float product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated
 // as holding two complex values, so the result is element 0 times element 1.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
   const auto first_element = a[0];
   const auto second_element = a[1];
   return first_element * second_element;
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Complex-float product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 (four complex values):
 // multiply the front half (lanes 0..3 of a.v) by the back half (lanes 4..7) as Packet2cf
 // operands, then multiply the two elements of that packet product as scalars.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
   const auto front_lanes = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3);
   const auto back_lanes = __builtin_shufflevector(a.v, a.v, 4, 5, 6, 7);
   const Packet2cf front_pair = Packet2cf(front_lanes);
   const Packet2cf back_pair = Packet2cf(back_lanes);
   const Packet2cf pair_product = pmul<Packet2cf>(front_pair, back_pair);
   return pair_product[0] * pair_product[1];
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Complex-float product for the fallback branch (annotated in this file as the 64-byte
 // case, eight complex values), reduced in three steps: 8 -> 4 -> 2 -> scalar.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
   // 8 -> 4: lanes 0..7 of a.v times lanes 8..15, as Packet4cf operands.
   const auto front8 = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7);
   const auto back8 = __builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15);
   const Packet4cf front_quad = Packet4cf(front8);
   const Packet4cf back_quad = Packet4cf(back8);
   const Packet4cf quad_product = pmul<Packet4cf>(front_quad, back_quad);

   // 4 -> 2: lanes 0..3 of the intermediate product times lanes 4..7, as Packet2cf operands.
   const auto front4 = __builtin_shufflevector(quad_product.v, quad_product.v, 0, 1, 2, 3);
   const auto back4 = __builtin_shufflevector(quad_product.v, quad_product.v, 4, 5, 6, 7);
   const Packet2cf front_pair = Packet2cf(front4);
   const Packet2cf back_pair = Packet2cf(back4);
   const Packet2cf pair_product = pmul<Packet2cf>(front_pair, back_pair);

   // 2 -> scalar.
   return pair_product[0] * pair_product[1];
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 // --- predux_mul for complex<double> ---
 #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

 // Complex-double product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated
 // as holding a single complex value, so element 0 is the result.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
   const std::complex<double> only_element = a[0];
   return only_element;
 }

 #elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

 // Complex-double product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: the packet is treated
 // as holding two complex values, so the result is element 0 times element 1.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
   const auto first_element = a[0];
   const auto second_element = a[1];
   return first_element * second_element;
 }

 #else  // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

 // Complex-double product for the fallback branch (annotated in this file as the 64-byte
 // case, four complex values): multiply the front half (lanes 0..3 of a.v) by the back half
 // (lanes 4..7) as Packet2cd operands, then multiply the two elements of that product.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
   const auto front_lanes = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3);
   const auto back_lanes = __builtin_shufflevector(a.v, a.v, 4, 5, 6, 7);
   const Packet2cd front_pair = Packet2cd(front_lanes);
   const Packet2cd back_pair = Packet2cd(back_lanes);
   const Packet2cd pair_product = pmul<Packet2cd>(front_pair, back_pair);
   return pair_product[0] * pair_product[1];
 }

 #endif  // EIGEN_GENERIC_VECTOR_SIZE_BYTES

 #endif

 }  // end namespace internal
 }  // end namespace Eigen

 #endif  // EIGEN_REDUCTIONS_CLANG_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2025 Rasmus Munk Larsen
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-License-Identifier: MPL-2.0

	#ifndef EIGEN_REDUCTIONS_CLANG_H
	#define EIGEN_REDUCTIONS_CLANG_H

	// IWYU pragma: private
	#include "../../InternalHeaderCheck.h"

	namespace Eigen {
	namespace internal {

	// --- Reductions ---
	#if EIGEN_HAS_BUILTIN(__builtin_reduce_min) && EIGEN_HAS_BUILTIN(__builtin_reduce_max) && \
	    EIGEN_HAS_BUILTIN(__builtin_reduce_or)
	// Stamps out the predux_min, predux_max and predux_any specializations for PACKET_TYPE.
	//  - predux_min / predux_max forward the packet to __builtin_reduce_min / __builtin_reduce_max.
	//  - predux_any compares every lane with zero and ORs the per-lane results, so it is true
	//    as soon as one lane compares unequal to zero.
	#define EIGEN_CLANG_PACKET_REDUX_MINMAX(PACKET_TYPE) \
	  template <> \
	  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_min(const PACKET_TYPE& a) { \
	    const auto smallest_lane = __builtin_reduce_min(a); \
	    return smallest_lane; \
	  } \
	  template <> \
	  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_max(const PACKET_TYPE& a) { \
	    const auto largest_lane = __builtin_reduce_max(a); \
	    return largest_lane; \
	  } \
	  template <> \
	  EIGEN_STRONG_INLINE bool predux_any(const PACKET_TYPE& a) { \
	    const auto nonzero_lanes = (a != 0); \
	    return __builtin_reduce_or(nonzero_lanes) != 0; \
	  }

	// Instantiate for the float, double and both integer packet types.
	EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf)
	EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd)
	EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi)
	EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl)
	// The generator macro is local to this header.
	#undef EIGEN_CLANG_PACKET_REDUX_MINMAX
	#endif

	#if EIGEN_HAS_BUILTIN(__builtin_reduce_add) && EIGEN_HAS_BUILTIN(__builtin_reduce_mul)
	// Stamps out predux (sum of all lanes) and predux_mul (product of all lanes) for
	// PACKET_TYPE by forwarding to __builtin_reduce_add / __builtin_reduce_mul.
	#define EIGEN_CLANG_PACKET_REDUX_INT(PACKET_TYPE) \
	  template <> \
	  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux<PACKET_TYPE>(const PACKET_TYPE& a) { \
	    const auto lane_sum = __builtin_reduce_add(a); \
	    return lane_sum; \
	  } \
	  template <> \
	  EIGEN_STRONG_INLINE unpacket_traits<PACKET_TYPE>::type predux_mul<PACKET_TYPE>(const PACKET_TYPE& a) { \
	    const auto lane_product = __builtin_reduce_mul(a); \
	    return lane_product; \
	  }

	// Integer packets only: __builtin_reduce_{mul,add} are only defined for integer types.
	EIGEN_CLANG_PACKET_REDUX_INT(PacketXi)
	EIGEN_CLANG_PACKET_REDUX_INT(PacketXl)
	// The generator macro is local to this header.
	#undef EIGEN_CLANG_PACKET_REDUX_INT
	#endif

	#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
	namespace detail {

	// Lane-folding helpers shared by the predux / predux_mul specializations.
	//
	// ReduceAddN / ReduceMulN read lanes 0..N-1 of `a`. They repeatedly combine the lower
	// half of the remaining lanes with the upper half (lane-wise + or *) until two lanes
	// are left, and return those two lanes as a pair:
	//   first  = combination of the even-indexed input lanes,
	//   second = combination of the odd-indexed input lanes.
	// The two-lane variants have nothing to fold and return lanes 0 and 1 as they are.

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd2(
	    const VectorT& a) {
	  const auto lane0 = a[0];
	  const auto lane1 = a[1];
	  return {lane0, lane1};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd4(
	    const VectorT& a) {
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(a, a, 0, 1);
	  const auto hi2 = __builtin_shufflevector(a, a, 2, 3);
	  const auto sum2 = lo2 + hi2;
	  return {sum2[0], sum2[1]};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
	    const VectorT& a) {
	  // 8 -> 4 lanes.
	  const auto lo4 = __builtin_shufflevector(a, a, 0, 1, 2, 3);
	  const auto hi4 = __builtin_shufflevector(a, a, 4, 5, 6, 7);
	  const auto sum4 = lo4 + hi4;
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(sum4, sum4, 0, 1);
	  const auto hi2 = __builtin_shufflevector(sum4, sum4, 2, 3);
	  const auto sum2 = lo2 + hi2;
	  return {sum2[0], sum2[1]};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
	    const VectorT& a) {
	  // 16 -> 8 lanes.
	  const auto lo8 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
	  const auto hi8 = __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
	  const auto sum8 = lo8 + hi8;
	  // 8 -> 4 lanes.
	  const auto lo4 = __builtin_shufflevector(sum8, sum8, 0, 1, 2, 3);
	  const auto hi4 = __builtin_shufflevector(sum8, sum8, 4, 5, 6, 7);
	  const auto sum4 = lo4 + hi4;
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(sum4, sum4, 0, 1);
	  const auto hi2 = __builtin_shufflevector(sum4, sum4, 2, 3);
	  const auto sum2 = lo2 + hi2;
	  return {sum2[0], sum2[1]};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul2(
	    const VectorT& a) {
	  const auto lane0 = a[0];
	  const auto lane1 = a[1];
	  return {lane0, lane1};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul4(
	    const VectorT& a) {
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(a, a, 0, 1);
	  const auto hi2 = __builtin_shufflevector(a, a, 2, 3);
	  const auto prod2 = lo2 * hi2;
	  return {prod2[0], prod2[1]};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
	    const VectorT& a) {
	  // 8 -> 4 lanes.
	  const auto lo4 = __builtin_shufflevector(a, a, 0, 1, 2, 3);
	  const auto hi4 = __builtin_shufflevector(a, a, 4, 5, 6, 7);
	  const auto prod4 = lo4 * hi4;
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(prod4, prod4, 0, 1);
	  const auto hi2 = __builtin_shufflevector(prod4, prod4, 2, 3);
	  const auto prod2 = lo2 * hi2;
	  return {prod2[0], prod2[1]};
	}

	template <typename VectorT>
	EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul16(
	    const VectorT& a) {
	  // 16 -> 8 lanes.
	  const auto lo8 = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
	  const auto hi8 = __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15);
	  const auto prod8 = lo8 * hi8;
	  // 8 -> 4 lanes.
	  const auto lo4 = __builtin_shufflevector(prod8, prod8, 0, 1, 2, 3);
	  const auto hi4 = __builtin_shufflevector(prod8, prod8, 4, 5, 6, 7);
	  const auto prod4 = lo4 * hi4;
	  // 4 -> 2 lanes.
	  const auto lo2 = __builtin_shufflevector(prod4, prod4, 0, 1);
	  const auto hi2 = __builtin_shufflevector(prod4, prod4, 2, 3);
	  const auto prod2 = lo2 * hi2;
	  return {prod2[0], prod2[1]};
	}
	}  // namespace detail

	// --- predux and predux_mul for float ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Adds up lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
	// detail::ReduceAdd4 yields the even-lane and odd-lane partial sums, which are then added.
	template <>
	EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
	  const auto partial_sums = detail::ReduceAdd4(a);
	  const float even_sum = partial_sums.first;
	  const float odd_sum = partial_sums.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
	// detail::ReduceMul4 yields the even-lane and odd-lane partial products, which are then multiplied.
	template <>
	EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
	  const auto partial_products = detail::ReduceMul4(a);
	  const float even_product = partial_products.first;
	  const float odd_product = partial_products.second;
	  return even_product * odd_product;
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Adds up lanes 0..7 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
	// detail::ReduceAdd8 yields the even-lane and odd-lane partial sums, which are then added.
	template <>
	EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
	  const auto partial_sums = detail::ReduceAdd8(a);
	  const float even_sum = partial_sums.first;
	  const float odd_sum = partial_sums.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0..7 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
	// detail::ReduceMul8 yields the even-lane and odd-lane partial products, which are then multiplied.
	template <>
	EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
	  const auto partial_products = detail::ReduceMul8(a);
	  const float even_product = partial_products.first;
	  const float odd_product = partial_products.second;
	  return even_product * odd_product;
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Adds up lanes 0..15 of `a` (fallback variant, annotated in this file as the 64-byte case):
	// detail::ReduceAdd16 yields the even-lane and odd-lane partial sums, which are then added.
	template <>
	EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
	  const auto partial_sums = detail::ReduceAdd16(a);
	  const float even_sum = partial_sums.first;
	  const float odd_sum = partial_sums.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0..15 of `a` (fallback variant, annotated in this file as the 64-byte case):
	// detail::ReduceMul16 yields the even-lane and odd-lane partial products, which are then multiplied.
	template <>
	EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
	  const auto partial_products = detail::ReduceMul16(a);
	  const float even_product = partial_products.first;
	  const float odd_product = partial_products.second;
	  return even_product * odd_product;
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	// --- predux and predux_mul for double ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Adds lanes 0 and 1 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
	// detail::ReduceAdd2 just hands back the two lanes.
	template <>
	EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
	  const auto lanes = detail::ReduceAdd2(a);
	  const double even_sum = lanes.first;
	  const double odd_sum = lanes.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0 and 1 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16):
	// detail::ReduceMul2 just hands back the two lanes.
	template <>
	EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
	  const auto lanes = detail::ReduceMul2(a);
	  const double even_product = lanes.first;
	  const double odd_product = lanes.second;
	  return even_product * odd_product;
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Adds up lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
	// detail::ReduceAdd4 yields the even-lane and odd-lane partial sums, which are then added.
	template <>
	EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
	  const auto partial_sums = detail::ReduceAdd4(a);
	  const double even_sum = partial_sums.first;
	  const double odd_sum = partial_sums.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0..3 of `a` (variant for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32):
	// detail::ReduceMul4 yields the even-lane and odd-lane partial products, which are then multiplied.
	template <>
	EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
	  const auto partial_products = detail::ReduceMul4(a);
	  const double even_product = partial_products.first;
	  const double odd_product = partial_products.second;
	  return even_product * odd_product;
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Adds up lanes 0..7 of `a` (fallback variant, annotated in this file as the 64-byte case):
	// detail::ReduceAdd8 yields the even-lane and odd-lane partial sums, which are then added.
	template <>
	EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
	  const auto partial_sums = detail::ReduceAdd8(a);
	  const double even_sum = partial_sums.first;
	  const double odd_sum = partial_sums.second;
	  return even_sum + odd_sum;
	}
	// Multiplies lanes 0..7 of `a` (fallback variant, annotated in this file as the 64-byte case):
	// detail::ReduceMul8 yields the even-lane and odd-lane partial products, which are then multiplied.
	template <>
	EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
	  const auto partial_products = detail::ReduceMul8(a);
	  const double even_product = partial_products.first;
	  const double odd_product = partial_products.second;
	  return even_product * odd_product;
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	// --- predux for complex<float> ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Complex sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: folds lanes 0..3 of a.v with
	// detail::ReduceAdd4; the even-lane sum becomes the real part and the odd-lane sum the
	// imaginary part.
	// NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
	  const auto lane_sums = detail::ReduceAdd4(a.v);
	  const float re = lane_sums.first;
	  const float im = lane_sums.second;
	  return std::complex<float>(re, im);
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Complex sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: folds lanes 0..7 of a.v with
	// detail::ReduceAdd8; the even-lane sum becomes the real part and the odd-lane sum the
	// imaginary part.
	// NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
	  const auto lane_sums = detail::ReduceAdd8(a.v);
	  const float re = lane_sums.first;
	  const float im = lane_sums.second;
	  return std::complex<float>(re, im);
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Complex sum for the fallback branch (annotated in this file as the 64-byte case): folds
	// lanes 0..15 of a.v with detail::ReduceAdd16; the even-lane sum becomes the real part and
	// the odd-lane sum the imaginary part.
	// NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcf.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
	  const auto lane_sums = detail::ReduceAdd16(a.v);
	  const float re = lane_sums.first;
	  const float im = lane_sums.second;
	  return std::complex<float>(re, im);
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	// --- predux for complex<double> ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Complex-double sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated as
	// holding a single complex value, so element 0 is the result.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
	  const std::complex<double> only_element = a[0];
	  return only_element;
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Complex-double sum for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: folds lanes 0..3 of a.v
	// with detail::ReduceAdd4; the even-lane sum becomes the real part and the odd-lane sum
	// the imaginary part.
	// NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcd.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
	  const auto lane_sums = detail::ReduceAdd4(a.v);
	  const double re = lane_sums.first;
	  const double im = lane_sums.second;
	  return std::complex<double>(re, im);
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Complex-double sum for the fallback branch (annotated in this file as the 64-byte case):
	// folds lanes 0..7 of a.v with detail::ReduceAdd8; the even-lane sum becomes the real part
	// and the odd-lane sum the imaginary part.
	// NOTE(review): presumably a.v stores (re, im) interleaved - confirm against PacketXcd.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
	  const auto lane_sums = detail::ReduceAdd8(a.v);
	  const double re = lane_sums.first;
	  const double im = lane_sums.second;
	  return std::complex<double>(re, im);
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	// --- predux_mul for complex<float> ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Complex-float product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated
	// as holding two complex values, so the result is element 0 times element 1.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
	  const auto first_element = a[0];
	  const auto second_element = a[1];
	  return first_element * second_element;
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Complex-float product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 (four complex values):
	// multiply the front half (lanes 0..3 of a.v) by the back half (lanes 4..7) as Packet2cf
	// operands, then multiply the two elements of that packet product as scalars.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
	  const auto front_lanes = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3);
	  const auto back_lanes = __builtin_shufflevector(a.v, a.v, 4, 5, 6, 7);
	  const Packet2cf front_pair = Packet2cf(front_lanes);
	  const Packet2cf back_pair = Packet2cf(back_lanes);
	  const Packet2cf pair_product = pmul<Packet2cf>(front_pair, back_pair);
	  return pair_product[0] * pair_product[1];
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Complex-float product for the fallback branch (annotated in this file as the 64-byte
	// case, eight complex values), reduced in three steps: 8 -> 4 -> 2 -> scalar.
	template <>
	EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
	  // 8 -> 4: lanes 0..7 of a.v times lanes 8..15, as Packet4cf operands.
	  const auto front8 = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7);
	  const auto back8 = __builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15);
	  const Packet4cf front_quad = Packet4cf(front8);
	  const Packet4cf back_quad = Packet4cf(back8);
	  const Packet4cf quad_product = pmul<Packet4cf>(front_quad, back_quad);

	  // 4 -> 2: lanes 0..3 of the intermediate product times lanes 4..7, as Packet2cf operands.
	  const auto front4 = __builtin_shufflevector(quad_product.v, quad_product.v, 0, 1, 2, 3);
	  const auto back4 = __builtin_shufflevector(quad_product.v, quad_product.v, 4, 5, 6, 7);
	  const Packet2cf front_pair = Packet2cf(front4);
	  const Packet2cf back_pair = Packet2cf(back4);
	  const Packet2cf pair_product = pmul<Packet2cf>(front_pair, back_pair);

	  // 2 -> scalar.
	  return pair_product[0] * pair_product[1];
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	// --- predux_mul for complex<double> ---
	#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16

	// Complex-double product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16: the packet is treated
	// as holding a single complex value, so element 0 is the result.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
	  const std::complex<double> only_element = a[0];
	  return only_element;
	}

	#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32

	// Complex-double product for EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32: the packet is treated
	// as holding two complex values, so the result is element 0 times element 1.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
	  const auto first_element = a[0];
	  const auto second_element = a[1];
	  return first_element * second_element;
	}

	#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64

	// Complex-double product for the fallback branch (annotated in this file as the 64-byte
	// case, four complex values): multiply the front half (lanes 0..3 of a.v) by the back half
	// (lanes 4..7) as Packet2cd operands, then multiply the two elements of that product.
	template <>
	EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
	  const auto front_lanes = __builtin_shufflevector(a.v, a.v, 0, 1, 2, 3);
	  const auto back_lanes = __builtin_shufflevector(a.v, a.v, 4, 5, 6, 7);
	  const Packet2cd front_pair = Packet2cd(front_lanes);
	  const Packet2cd back_pair = Packet2cd(back_lanes);
	  const Packet2cd pair_product = pmul<Packet2cd>(front_pair, back_pair);
	  return pair_product[0] * pair_product[1];
	}

	#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES

	#endif

	} // end namespace internal
	} // end namespace Eigen

	#endif // EIGEN_REDUCTIONS_CLANG_H